From b1b33713d64cad90e0659362e8dcb37a207d1868 Mon Sep 17 00:00:00 2001 From: Sergey Stepanov Date: Wed, 4 Oct 2023 15:14:31 +0300 Subject: [PATCH] Add the initial libyuv support The main benefit of libyuv, apart from shortening the video pipeline, is a quite noticeable decrease in latency and CPU usage due to the various assembler/SIMD optimizations of the library. However, there is a drawback for macOS systems: libyuv cannot be downloaded as a compiled library and can only be built from source, which means we should either include a cropped copy of the library's source code (~10K LoC) in the app or raise the complexity of the macOS dev and run toolchains. The main target system, Linux, as well as Windows will use the compiled lib from the package managers, and macOS will use the lib included as shortened source code. Building the app with the no_libyuv tag will force it to use libyuv from the provided source files. --- .github/workflows/build.yml | 29 +- .../workflows/cd/cloudretro.io/config.yaml | 6 +- Dockerfile | 1 + Makefile | 5 + README.md | 4 +- pkg/config/config.yaml | 16 +- pkg/config/emulator.go | 2 +- pkg/config/shared.go | 9 +- pkg/config/worker.go | 5 +- pkg/encoder/color/bgra/bgra.go | 56 + pkg/encoder/color/rgb565/rgb565.go | 62 + pkg/encoder/color/rgba/rgba.go | 24 + pkg/encoder/encoder.go | 80 +- pkg/encoder/yuv/libyuv/LICENSE | 29 + pkg/encoder/yuv/libyuv/basic_types.h | 29 + pkg/encoder/yuv/libyuv/convert.c | 336 +++ pkg/encoder/yuv/libyuv/convert.h | 113 + pkg/encoder/yuv/libyuv/convert_argb.h | 24 + pkg/encoder/yuv/libyuv/convert_to_i420.c | 116 + pkg/encoder/yuv/libyuv/cpu_id.c | 204 ++ pkg/encoder/yuv/libyuv/cpu_id.h | 106 + pkg/encoder/yuv/libyuv/libyuv.go | 142 + pkg/encoder/yuv/libyuv/libyuv2.go | 89 + pkg/encoder/yuv/libyuv/planar_functions.c | 68 + pkg/encoder/yuv/libyuv/planar_functions.h | 46 + pkg/encoder/yuv/libyuv/rotate.c | 217 ++ pkg/encoder/yuv/libyuv/rotate.h | 79 + pkg/encoder/yuv/libyuv/rotate_any.c | 54 + pkg/encoder/yuv/libyuv/rotate_common.c | 77 +
pkg/encoder/yuv/libyuv/rotate_gcc.c | 370 +++ pkg/encoder/yuv/libyuv/rotate_row.h | 106 + pkg/encoder/yuv/libyuv/row.h | 426 +++ pkg/encoder/yuv/libyuv/row_any.c | 206 ++ pkg/encoder/yuv/libyuv/row_common.c | 887 ++++++ pkg/encoder/yuv/libyuv/row_gcc.c | 1090 +++++++ pkg/encoder/yuv/libyuv/scale.c | 946 ++++++ pkg/encoder/yuv/libyuv/scale.h | 53 + pkg/encoder/yuv/libyuv/scale_any.c | 632 ++++ pkg/encoder/yuv/libyuv/scale_common.c | 930 ++++++ pkg/encoder/yuv/libyuv/scale_gcc.c | 2651 +++++++++++++++++ pkg/encoder/yuv/libyuv/scale_row.h | 768 +++++ pkg/encoder/yuv/libyuv/version.h | 16 + pkg/encoder/yuv/libyuv/video_common.c | 50 + pkg/encoder/yuv/libyuv/video_common.h | 212 ++ pkg/encoder/yuv/yuv.c | 130 - pkg/encoder/yuv/yuv.go | 153 +- pkg/encoder/yuv/yuv.h | 18 - pkg/encoder/yuv/yuv_test.go | 340 ++- pkg/worker/caged/app/app.go | 10 +- pkg/worker/caged/libretro/caged.go | 6 +- pkg/worker/caged/libretro/frontend.go | 49 +- pkg/worker/caged/libretro/frontend_test.go | 289 +- pkg/worker/caged/libretro/image/canvas.c | 88 - pkg/worker/caged/libretro/image/canvas.go | 159 - pkg/worker/caged/libretro/image/canvas.h | 27 - .../caged/libretro/image/canvas_test.go | 340 --- pkg/worker/caged/libretro/manager/http.go | 22 +- .../caged/libretro/nanoarch/nanoarch.go | 67 +- pkg/worker/caged/libretro/recording.go | 31 +- pkg/worker/coordinatorhandlers.go | 1 + pkg/worker/media/media.go | 38 +- pkg/worker/media/media_test.go | 9 +- pkg/worker/recorder/ffmpegmux.go | 28 +- pkg/worker/recorder/options.go | 20 +- pkg/worker/recorder/pngstream.go | 72 - pkg/worker/recorder/rawstream.go | 66 + pkg/worker/recorder/recorder.go | 10 +- pkg/worker/recorder/recorder_test.go | 62 +- pkg/worker/room/room_test.go | 107 +- pkg/worker/thread/mainthread_darwin_test.go | 16 +- test/test.go | 17 + .../raw/000_name_fourcc_width_height_stride | 0 .../raw/001_alsa_ABGR_256_240_1024.raw.zip | Bin 0 -> 3748 bytes 73 files changed, 12010 insertions(+), 1536 deletions(-) create mode 100644 
pkg/encoder/color/bgra/bgra.go create mode 100644 pkg/encoder/color/rgb565/rgb565.go create mode 100644 pkg/encoder/color/rgba/rgba.go create mode 100644 pkg/encoder/yuv/libyuv/LICENSE create mode 100644 pkg/encoder/yuv/libyuv/basic_types.h create mode 100644 pkg/encoder/yuv/libyuv/convert.c create mode 100644 pkg/encoder/yuv/libyuv/convert.h create mode 100644 pkg/encoder/yuv/libyuv/convert_argb.h create mode 100644 pkg/encoder/yuv/libyuv/convert_to_i420.c create mode 100644 pkg/encoder/yuv/libyuv/cpu_id.c create mode 100644 pkg/encoder/yuv/libyuv/cpu_id.h create mode 100644 pkg/encoder/yuv/libyuv/libyuv.go create mode 100644 pkg/encoder/yuv/libyuv/libyuv2.go create mode 100644 pkg/encoder/yuv/libyuv/planar_functions.c create mode 100644 pkg/encoder/yuv/libyuv/planar_functions.h create mode 100644 pkg/encoder/yuv/libyuv/rotate.c create mode 100644 pkg/encoder/yuv/libyuv/rotate.h create mode 100644 pkg/encoder/yuv/libyuv/rotate_any.c create mode 100644 pkg/encoder/yuv/libyuv/rotate_common.c create mode 100644 pkg/encoder/yuv/libyuv/rotate_gcc.c create mode 100644 pkg/encoder/yuv/libyuv/rotate_row.h create mode 100644 pkg/encoder/yuv/libyuv/row.h create mode 100644 pkg/encoder/yuv/libyuv/row_any.c create mode 100644 pkg/encoder/yuv/libyuv/row_common.c create mode 100644 pkg/encoder/yuv/libyuv/row_gcc.c create mode 100644 pkg/encoder/yuv/libyuv/scale.c create mode 100644 pkg/encoder/yuv/libyuv/scale.h create mode 100644 pkg/encoder/yuv/libyuv/scale_any.c create mode 100644 pkg/encoder/yuv/libyuv/scale_common.c create mode 100644 pkg/encoder/yuv/libyuv/scale_gcc.c create mode 100644 pkg/encoder/yuv/libyuv/scale_row.h create mode 100644 pkg/encoder/yuv/libyuv/version.h create mode 100644 pkg/encoder/yuv/libyuv/video_common.c create mode 100644 pkg/encoder/yuv/libyuv/video_common.h delete mode 100644 pkg/encoder/yuv/yuv.c delete mode 100644 pkg/encoder/yuv/yuv.h delete mode 100644 pkg/worker/caged/libretro/image/canvas.c delete mode 100644 
pkg/worker/caged/libretro/image/canvas.go delete mode 100644 pkg/worker/caged/libretro/image/canvas.h delete mode 100644 pkg/worker/caged/libretro/image/canvas_test.go delete mode 100644 pkg/worker/recorder/pngstream.go create mode 100644 pkg/worker/recorder/rawstream.go create mode 100644 test/test.go create mode 100644 test/testdata/raw/000_name_fourcc_width_height_stride create mode 100644 test/testdata/raw/001_alsa_ABGR_256_240_1024.raw.zip diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 8a156e0f9..ccae921f1 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,5 +1,5 @@ # ------------------------------------------------------------ -# Build workflow (Linux x64, macOS x64, Windows x64) +# Build and test workflow (Linux x64, macOS x64, Windows x64) # ------------------------------------------------------------ name: build @@ -20,7 +20,7 @@ jobs: strategy: matrix: os: [ ubuntu-latest, macos-latest, windows-latest ] - step: [ build, check ] + step: [ build, test ] runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v3 @@ -33,7 +33,7 @@ jobs: if: matrix.os == 'ubuntu-latest' run: | sudo apt-get -qq update - sudo apt-get -qq install -y make pkg-config libvpx-dev libx264-dev libopus-dev libsdl2-dev libgl1-mesa-glx + sudo apt-get -qq install -y make pkg-config libvpx-dev libx264-dev libopus-dev libsdl2-dev libyuv-dev libgl1-mesa-glx - name: Get MacOS dev libraries and tools if: matrix.os == 'macos-latest' @@ -55,9 +55,10 @@ jobs: mingw-w64-x86_64-opus mingw-w64-x86_64-x264-git mingw-w64-x86_64-SDL2 + mingw-w64-x86_64-libyuv - name: Get Windows OpenGL drivers - if: matrix.step == 'check' && matrix.os == 'windows-latest' + if: matrix.step == 'test' && matrix.os == 'windows-latest' shell: msys2 {0} run: | wget -q https://github.com/pal1000/mesa-dist-win/releases/download/20.2.1/mesa3d-20.2.1-release-mingw.7z @@ -81,28 +82,28 @@ jobs: run: | make build - - name: Verify core rendering (windows-latest) - if: 
matrix.step == 'check' && matrix.os == 'windows-latest' && always() + - name: Test (windows-latest) + if: matrix.step == 'test' && matrix.os == 'windows-latest' && always() shell: msys2 {0} env: MESA_GL_VERSION_OVERRIDE: 3.3COMPAT run: | - GL_CTX=-autoGlContext make verify-cores + GL_CTX=-autoGlContext make test verify-cores - - name: Verify core rendering (ubuntu-latest) - if: matrix.step == 'check' && matrix.os == 'ubuntu-latest' && always() + - name: Test (ubuntu-latest) + if: matrix.step == 'test' && matrix.os == 'ubuntu-latest' && always() env: MESA_GL_VERSION_OVERRIDE: 3.3COMPAT run: | - GL_CTX=-autoGlContext xvfb-run --auto-servernum make verify-cores + GL_CTX=-autoGlContext xvfb-run --auto-servernum make test verify-cores - - name: Verify core rendering (macos-latest) - if: matrix.step == 'check' && matrix.os == 'macos-latest' && always() + - name: Test (macos-latest) + if: matrix.step == 'test' && matrix.os == 'macos-latest' && always() run: | - make verify-cores + make test verify-cores - uses: actions/upload-artifact@v3 - if: matrix.step == 'check' && always() + if: matrix.step == 'test' && always() with: name: emulator-test-frames path: _rendered/*.png diff --git a/.github/workflows/cd/cloudretro.io/config.yaml b/.github/workflows/cd/cloudretro.io/config.yaml index 9cfb0e7b7..fa8b21a59 100644 --- a/.github/workflows/cd/cloudretro.io/config.yaml +++ b/.github/workflows/cd/cloudretro.io/config.yaml @@ -24,14 +24,16 @@ worker: domain: cloudretro.io emulator: - threads: 4 libretro: logLevel: 1 cores: list: mame: options: - "fbneo-cpu-speed-adjust": "200%" "fbneo-diagnostic-input": "Hold Start" + nes: + scale: 2 pcsx: altRepo: true + snes: + scale: 2 diff --git a/Dockerfile b/Dockerfile index 98a8a807b..d874271d5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -51,6 +51,7 @@ RUN apt-get -q update && apt-get -q install --no-install-recommends -y \ libopus-dev \ libsdl2-dev \ libvpx-dev \ + libyuv-dev \ libx264-dev \ pkg-config \ && rm -rf /var/lib/apt/lists/* 
diff --git a/Makefile b/Makefile index f8097ac01..f0afe6ad7 100644 --- a/Makefile +++ b/Makefile @@ -6,6 +6,8 @@ CGO_CFLAGS='-g -O3 -funroll-loops' CGO_LDFLAGS='-g -O3' GO_TAGS=static +.PHONY: clean test + fmt: @goimports -w cmd pkg tests @gofmt -s -w cmd pkg tests @@ -32,6 +34,9 @@ build.worker: build: build.coordinator build.worker +test: + go test -v ./pkg/... + verify-cores: go test -run TestAll ./pkg/worker/room -v -renderFrames $(GL_CTX) -outputPath "../../../_rendered" diff --git a/README.md b/README.md index 66944fdce..b3f181c31 100644 --- a/README.md +++ b/README.md @@ -61,13 +61,13 @@ a better sense of performance. ``` # Ubuntu / Windows (WSL2) -apt-get install -y make gcc pkg-config libvpx-dev libx264-dev libopus-dev libsdl2-dev +apt-get install -y make gcc pkg-config libvpx-dev libx264-dev libopus-dev libsdl2-dev libyuv-dev # MacOS brew install pkg-config libvpx x264 opus sdl2 # Windows (MSYS2) -pacman -Sy --noconfirm --needed git make mingw-w64-x86_64-{gcc,pkgconf,dlfcn,libvpx,opus,x264-git,SDL2} +pacman -Sy --noconfirm --needed git make mingw-w64-x86_64-{gcc,pkgconf,dlfcn,libvpx,opus,x264-git,SDL2,libyuv} ``` Because the coordinator and workers need to run simultaneously. Workers connect to the coordinator. diff --git a/pkg/config/config.yaml b/pkg/config/config.yaml index 481269ad9..30737b082 100644 --- a/pkg/config/config.yaml +++ b/pkg/config/config.yaml @@ -99,12 +99,9 @@ worker: tag: emulator: - # set output viewport scale factor - scale: 1 - # set the total number of threads for the image processing - # (experimental) - threads: 4 + # (removed) + threads: 0 aspectRatio: # enable aspect ratio changing @@ -163,6 +160,7 @@ emulator: # - altRepo (bool) prioritize secondary repo as the download source # - lib (string) # - roms ([]string) + # - scale (int) scales the output video frames by this factor. # - folder (string) # By default emulator selection is based on the folder named as cores # in the list (i.e. 
nes, snes) but if you specify folder param, @@ -244,8 +242,6 @@ encoder: video: # h264, vpx (VP8) codec: h264 - # concurrent execution units (0 - disabled) - concurrency: 0 # see: https://trac.ffmpeg.org/wiki/Encode/H.264 h264: # Constant Rate Factor (CRF) 0-51 (default: 23) @@ -273,12 +269,6 @@ encoder: # one additional FFMPEG concat demux file recording: enabled: false - # image compression level: - # 0 - default compression - # -1 - no compression - # -2 - best speed - # -3 - best compression - compressLevel: 0 # name contains the name of the recording dir (or zip) # format: # %date:go_time_format% -- refer: https://go.dev/src/time/format.go diff --git a/pkg/config/emulator.go b/pkg/config/emulator.go index 3b9025188..dda7b4865 100644 --- a/pkg/config/emulator.go +++ b/pkg/config/emulator.go @@ -7,7 +7,6 @@ import ( ) type Emulator struct { - Scale int Threads int AspectRatio struct { Keep bool @@ -54,6 +53,7 @@ type LibretroCoreConfig struct { Lib string Options map[string]string Roms []string + Scale float64 UsesLibCo bool VFR bool Width int diff --git a/pkg/config/shared.go b/pkg/config/shared.go index 856479f4e..026b79d3d 100644 --- a/pkg/config/shared.go +++ b/pkg/config/shared.go @@ -41,11 +41,10 @@ type Server struct { } type Recording struct { - Enabled bool - CompressLevel int - Name string - Folder string - Zip bool + Enabled bool + Name string + Folder string + Zip bool } func (s *Server) WithFlags() { diff --git a/pkg/config/worker.go b/pkg/config/worker.go index ed0145d49..ab6af2cc8 100644 --- a/pkg/config/worker.go +++ b/pkg/config/worker.go @@ -52,9 +52,8 @@ type Audio struct { } type Video struct { - Codec string - Concurrency int - H264 struct { + Codec string + H264 struct { Crf uint8 LogLevel int32 Preset string diff --git a/pkg/encoder/color/bgra/bgra.go b/pkg/encoder/color/bgra/bgra.go new file mode 100644 index 000000000..39a50c228 --- /dev/null +++ b/pkg/encoder/color/bgra/bgra.go @@ -0,0 +1,56 @@ +package bgra + +import ( + "image" +
"image/color" +) + +type BGRA struct { + image.RGBA +} + +var BGRAModel = color.ModelFunc(func(c color.Color) color.Color { + if _, ok := c.(BGRAColor); ok { + return c + } + r, g, b, a := c.RGBA() + return BGRAColor{uint8(b >> 8), uint8(g >> 8), uint8(r >> 8), uint8(a >> 8)} +}) + +// BGRAColor represents a BGRA color; its fields follow the pixel byte order used by At, so R holds byte 0 (blue) and B holds byte 2 (red). +type BGRAColor struct { + R, G, B, A uint8 +} + +func (c BGRAColor) RGBA() (r, g, b, a uint32) { + r = uint32(c.B) + r |= r << 8 + g = uint32(c.G) + g |= g << 8 + b = uint32(c.R) + b |= b << 8 + a = uint32(255) //uint32(c.A) + a |= a << 8 + return +} + +func NewBGRA(r image.Rectangle) *BGRA { + return &BGRA{*image.NewRGBA(r)} +} + +func (p *BGRA) ColorModel() color.Model { return BGRAModel } +func (p *BGRA) At(x, y int) color.Color { + i := p.PixOffset(x, y) + s := p.Pix[i : i+4 : i+4] + return BGRAColor{s[0], s[1], s[2], s[3]} +} + +func (p *BGRA) Set(x, y int, c color.Color) { + i := p.PixOffset(x, y) + c1 := BGRAModel.Convert(c).(BGRAColor) + s := p.Pix[i : i+4 : i+4] + s[0] = c1.R + s[1] = c1.G + s[2] = c1.B + s[3] = 255 +} diff --git a/pkg/encoder/color/rgb565/rgb565.go b/pkg/encoder/color/rgb565/rgb565.go new file mode 100644 index 000000000..11c66c8bf --- /dev/null +++ b/pkg/encoder/color/rgb565/rgb565.go @@ -0,0 +1,62 @@ +package rgb565 + +import ( + "encoding/binary" + "image" + "image/color" + "math" +) + +// RGB565 is an in-memory image whose At method returns RGB565 values. +type RGB565 struct { + // Pix holds the image's pixels, as RGB565 values in little-endian format. The pixel at + // (x, y) starts at Pix[(y-p.Rect.Min.Y)*p.Stride + (x-p.Rect.Min.X)*2]. + Pix []uint8 + // Stride is the Pix stride (in bytes) between vertically adjacent pixels. + Stride int + // Rect is the image's bounds. + Rect image.Rectangle +} + +// Model is the model for RGB565 colors.
+var Model = color.ModelFunc(func(c color.Color) color.Color { + //if _, ok := c.(Color); ok { + // return c + //} + r, g, b, _ := c.RGBA() + return Color(uint16(r&rMask | (g>>5)&gMask | (b>>11)&bMask)) +}) + +const ( + rMask = 0b1111100000000000 + gMask = 0b0000011111100000 + bMask = 0b0000000000011111 +) + +// Color represents an RGB565 color. +type Color uint16 + +func (c Color) RGBA() (r, g, b, a uint32) { + return uint32(math.Round(float64(c&rMask>>11)*255.0/31.0)) << 8, + uint32(math.Round(float64(c&gMask>>5)*255.0/63.0)) << 8, + uint32(math.Round(float64(c&bMask)*255.0/31.0)) << 8, + 0xffff +} + +func NewRGB565(r image.Rectangle) *RGB565 { + return &RGB565{Pix: make([]uint8, r.Dx()*r.Dy()<<1), Stride: r.Dx() << 1, Rect: r} +} + +func (p *RGB565) Bounds() image.Rectangle { return p.Rect } +func (p *RGB565) ColorModel() color.Model { return Model } +func (p *RGB565) PixOffset(x, y int) int { return (x-p.Rect.Min.X)<<1 + (y-p.Rect.Min.Y)*p.Stride } + +func (p *RGB565) At(x, y int) color.Color { + i := p.PixOffset(x, y) + return Color(binary.LittleEndian.Uint16(p.Pix[i : i+2])) +} + +func (p *RGB565) Set(x, y int, c color.Color) { + i := p.PixOffset(x, y) + binary.LittleEndian.PutUint16(p.Pix[i:i+2], uint16(Model.Convert(c).(Color))) +} diff --git a/pkg/encoder/color/rgba/rgba.go b/pkg/encoder/color/rgba/rgba.go new file mode 100644 index 000000000..c37d62181 --- /dev/null +++ b/pkg/encoder/color/rgba/rgba.go @@ -0,0 +1,24 @@ +package rgba + +import ( + "image" + "image/color" +) + +func ToRGBA(img image.Image, flipped bool) *image.RGBA { + bounds := img.Bounds() + sw, sh := bounds.Dx(), bounds.Dy() + dst := image.NewRGBA(image.Rect(0, 0, sw, sh)) + for y := 0; y < sh; y++ { + yy := y + if flipped { + yy = sh - 1 - y // mirror the row: valid destination rows are 0..sh-1 + } + for x := 0; x < sw; x++ { + px := img.At(bounds.Min.X+x, bounds.Min.Y+y) + rgba := color.RGBAModel.Convert(px).(color.RGBA) + dst.Set(x, yy, rgba) + } + } + return dst +} diff --git a/pkg/encoder/encoder.go b/pkg/encoder/encoder.go index 66827d9e0..60e960d01 100644 ---
a/pkg/encoder/encoder.go +++ b/pkg/encoder/encoder.go @@ -1,7 +1,7 @@ package encoder import ( - "image" + "fmt" "sync" "sync/atomic" @@ -10,7 +10,7 @@ import ( ) type ( - InFrame *image.RGBA + InFrame yuv.RawFrame OutFrame []byte Encoder interface { LoadBuf(input []byte) @@ -21,11 +21,13 @@ type ( } ) -type VideoEncoder struct { - encoder Encoder +type Video struct { + codec Encoder log *logger.Logger stopped atomic.Bool - y yuv.ImgProcessor + y yuv.Conv + pf yuv.PixFmt + rot uint mu sync.Mutex } @@ -41,39 +43,63 @@ const ( // converts them into YUV I420 format, // encodes with provided video encoder, and // puts the result into the output channel. -func NewVideoEncoder(enc Encoder, w, h int, concurrency int, log *logger.Logger) *VideoEncoder { - y := yuv.NewYuvImgProcessor(w, h, &yuv.Options{Threads: concurrency}) - if concurrency > 0 { - log.Info().Msgf("Use concurrent image processor: %v", concurrency) - } - return &VideoEncoder{encoder: enc, y: y, log: log} +func NewVideoEncoder(codec Encoder, w, h int, scale float64, log *logger.Logger) *Video { + return &Video{codec: codec, y: yuv.NewYuvConv(w, h, scale), log: log} } -func (vp *VideoEncoder) Encode(img InFrame) OutFrame { - vp.mu.Lock() - defer vp.mu.Unlock() - if vp.stopped.Load() { +func (v *Video) Encode(frame InFrame) OutFrame { + v.mu.Lock() + defer v.mu.Unlock() + if v.stopped.Load() { return nil } - yCbCr := vp.y.Process(img) - vp.encoder.LoadBuf(yCbCr) - vp.y.Put(&yCbCr) + yCbCr := v.y.Process(yuv.RawFrame(frame), v.rot, v.pf) + v.codec.LoadBuf(yCbCr) + v.y.Put(&yCbCr) - if frame := vp.encoder.Encode(); len(frame) > 0 { - return frame + if bytes := v.codec.Encode(); len(bytes) > 0 { + return bytes } return nil } -func (vp *VideoEncoder) SetFlip(b bool) { vp.encoder.SetFlip(b) } +func (v *Video) Info() string { return fmt.Sprintf("libyuv: %v", v.y.Version()) } + +func (v *Video) SetPixFormat(f uint32) { + switch f { + case 1: + v.pf = yuv.PixFmt(yuv.FourccArgb) + case 2: + v.pf = 
yuv.PixFmt(yuv.FourccRgbp) + default: + v.pf = yuv.PixFmt(yuv.FourccAbgr) + } +} + +// SetRot sets the rotation angle of the frames. +func (v *Video) SetRot(r uint) { + switch r { + // de-rotate + case 90: + v.rot = 270 + case 270: + v.rot = 90 + default: + v.rot = r + } +} + +// SetFlip tells the encoder to flip the frames vertically. +func (v *Video) SetFlip(b bool) { v.codec.SetFlip(b) } -func (vp *VideoEncoder) Stop() { - vp.stopped.Store(true) - vp.mu.Lock() - defer vp.mu.Unlock() +func (v *Video) Stop() { + v.stopped.Store(true) + v.mu.Lock() + defer v.mu.Unlock() + v.rot = 0 - if err := vp.encoder.Shutdown(); err != nil { - vp.log.Error().Err(err).Msg("failed to close the encoder") + if err := v.codec.Shutdown(); err != nil { + v.log.Error().Err(err).Msg("failed to close the encoder") } } diff --git a/pkg/encoder/yuv/libyuv/LICENSE b/pkg/encoder/yuv/libyuv/LICENSE new file mode 100644 index 000000000..c911747a6 --- /dev/null +++ b/pkg/encoder/yuv/libyuv/LICENSE @@ -0,0 +1,29 @@ +Copyright 2011 The LibYuv Project Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + * Neither the name of Google nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/pkg/encoder/yuv/libyuv/basic_types.h b/pkg/encoder/yuv/libyuv/basic_types.h new file mode 100644 index 000000000..9c66a132a --- /dev/null +++ b/pkg/encoder/yuv/libyuv/basic_types.h @@ -0,0 +1,29 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ +#define INCLUDE_LIBYUV_BASIC_TYPES_H_ + +#include <stddef.h> // For size_t and NULL + +#if !defined(INT_TYPES_DEFINED) && !defined(GG_LONGLONG) +#define INT_TYPES_DEFINED + +#include <stdint.h> // for uintptr_t and C99 types + +#endif // INT_TYPES_DEFINED + +#if !defined(LIBYUV_API) +#define LIBYUV_API +#endif // LIBYUV_API + +#define LIBYUV_BOOL int + +#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ diff --git a/pkg/encoder/yuv/libyuv/convert.c b/pkg/encoder/yuv/libyuv/convert.c new file mode 100644 index 000000000..c59da3b1b --- /dev/null +++ b/pkg/encoder/yuv/libyuv/convert.c @@ -0,0 +1,336 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree.
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "convert.h" + +#include "basic_types.h" +#include "cpu_id.h" +#include "planar_functions.h" +#include "row.h" + +// Subsample amount uses a shift. +// v is value +// a is amount to add to round up +// s is shift to subsample down +#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) + +static __inline int Abs(int v) { + return v >= 0 ? v : -v; +} + +// Copy I420 with optional flipping. +// TODO(fbarchard): Use Scale plane which supports mirroring, but ensure +// is does row coalescing. +LIBYUV_API +int I420Copy(const uint8_t *src_y, + int src_stride_y, + const uint8_t *src_u, + int src_stride_u, + const uint8_t *src_v, + int src_stride_v, + uint8_t *dst_y, + int dst_stride_y, + uint8_t *dst_u, + int dst_stride_u, + uint8_t *dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + // Copy UV planes. + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); + return 0; +} + +// Convert ARGB to I420. 
+LIBYUV_API +int ARGBToI420(const uint8_t *src_argb, + int src_stride_argb, + uint8_t *dst_y, + int dst_stride_y, + uint8_t *dst_u, + int dst_stride_u, + uint8_t *dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*ARGBToUVRow)(const uint8_t *src_argb0, int src_stride_argb, + uint8_t *dst_u, uint8_t *dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t *src_argb, uint8_t *dst_y, int width) = + ARGBToYRow_C; + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + src_argb += src_stride_argb * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + } + return 0; +} + +// Convert ABGR to I420. 
+LIBYUV_API +int ABGRToI420(const uint8_t *src_abgr, + int src_stride_abgr, + uint8_t *dst_y, + int dst_stride_y, + uint8_t *dst_u, + int dst_stride_u, + uint8_t *dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*ABGRToUVRow)(const uint8_t *src_abgr0, int src_stride_abgr, + uint8_t *dst_u, uint8_t *dst_v, int width) = + ABGRToUVRow_C; + void (*ABGRToYRow)(const uint8_t *src_abgr, uint8_t *dst_y, int width) = + ABGRToYRow_C; + if (!src_abgr || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } +#if defined(HAS_ABGRTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToYRow = ABGRToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToYRow = ABGRToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToUVRow = ABGRToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToYRow = ABGRToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToYRow = ABGRToYRow_AVX2; + } + } +#endif +#if defined(HAS_ABGRTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToUVRow = ABGRToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToUVRow = ABGRToUVRow_AVX2; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width); + ABGRToYRow(src_abgr, dst_y, width); + ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width); + src_abgr += src_stride_abgr * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ABGRToUVRow(src_abgr, 0, dst_u, dst_v, width); + ABGRToYRow(src_abgr, dst_y, width); + } + return 0; +} + +// Convert RGB565 to I420. 
+LIBYUV_API +int RGB565ToI420(const uint8_t *src_rgb565, + int src_stride_rgb565, + uint8_t *dst_y, + int dst_stride_y, + uint8_t *dst_u, + int dst_stride_u, + uint8_t *dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*RGB565ToARGBRow)(const uint8_t *src_rgb, uint8_t *dst_argb, + int width) = RGB565ToARGBRow_C; + void (*ARGBToUVRow)(const uint8_t *src_argb0, int src_stride_argb, + uint8_t *dst_u, uint8_t *dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t *src_argb, uint8_t *dst_y, int width) = + ARGBToYRow_C; + if (!src_rgb565 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565; + src_stride_rgb565 = -src_stride_rgb565; + } + +#if defined(HAS_RGB565TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + RGB565ToARGBRow = RGB565ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_RGB565TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + RGB565ToARGBRow = RGB565ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = 
ARGBToUVRow_AVX2; + } + } +#endif + { +#if !(defined(HAS_RGB565TOYROW_NEON)) + // Allocate 2 rows of ARGB. + const int row_size = (width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); +#endif + for (y = 0; y < height - 1; y += 2) { +#if (defined(HAS_RGB565TOYROW_NEON)) +#else + RGB565ToARGBRow(src_rgb565, row, width); + RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + row_size, width); + ARGBToUVRow(row, row_size, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + row_size, dst_y + dst_stride_y, width); +#endif + src_rgb565 += src_stride_rgb565 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if (defined(HAS_RGB565TOYROW_NEON)) +#else + RGB565ToARGBRow(src_rgb565, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !(defined(HAS_RGB565TOYROW_NEON)) + free_aligned_buffer_64(row); +#endif + } + return 0; +} diff --git a/pkg/encoder/yuv/libyuv/convert.h b/pkg/encoder/yuv/libyuv/convert.h new file mode 100644 index 000000000..9a81c509c --- /dev/null +++ b/pkg/encoder/yuv/libyuv/convert.h @@ -0,0 +1,113 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_CONVERT_H_ +#define INCLUDE_LIBYUV_CONVERT_H_ + +#include "rotate.h" // For enum RotationMode. + +// Copy I420 to I420. 
+#define I420ToI420 I420Copy
+LIBYUV_API
+int I420Copy(const uint8_t *src_y,
+             int src_stride_y,
+             const uint8_t *src_u,
+             int src_stride_u,
+             const uint8_t *src_v,
+             int src_stride_v,
+             uint8_t *dst_y,
+             int dst_stride_y,
+             uint8_t *dst_u,
+             int dst_stride_u,
+             uint8_t *dst_v,
+             int dst_stride_v,
+             int width,
+             int height);
+
+// ARGB little endian (bgra in memory) to I420.
+LIBYUV_API
+int ARGBToI420(const uint8_t *src_argb,
+               int src_stride_argb,
+               uint8_t *dst_y,
+               int dst_stride_y,
+               uint8_t *dst_u,
+               int dst_stride_u,
+               uint8_t *dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+
+// ABGR little endian (rgba in memory) to I420.
+LIBYUV_API
+int ABGRToI420(const uint8_t *src_abgr,
+               int src_stride_abgr,
+               uint8_t *dst_y,
+               int dst_stride_y,
+               uint8_t *dst_u,
+               int dst_stride_u,
+               uint8_t *dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+
+// RGB16 (RGBP fourcc) little endian to I420.
+LIBYUV_API
+int RGB565ToI420(const uint8_t *src_rgb565,
+                 int src_stride_rgb565,
+                 uint8_t *dst_y,
+                 int dst_stride_y,
+                 uint8_t *dst_u,
+                 int dst_stride_u,
+                 uint8_t *dst_v,
+                 int dst_stride_v,
+                 int width,
+                 int height);
+
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// "sample_size" is the size of the sample in bytes; it is needed to parse
+//   MJPG.
+// "dst_stride_y" number of bytes in a row of the dst_y plane.
+//   Normally this would be the same as dst_width, with recommended alignment
+//   to 16 bytes for better efficiency.
+//   If rotation of 90 or 270 is used, stride is affected. The caller should
+//   allocate the I420 buffer according to rotation.
+// "dst_stride_u" number of bytes in a row of the dst_u plane.
+//   Normally this would be the same as (dst_width + 1) / 2, with
+//   recommended alignment to 16 bytes for better efficiency.
+//   If rotation of 90 or 270 is used, stride is affected.
+// "crop_x" and "crop_y" are the starting position for cropping.
+//   To center, crop_x = (src_width - dst_width) / 2
+//              crop_y = (src_height - dst_height) / 2
+// "src_width" / "src_height" is the size of src_frame in pixels.
+//   "src_height" can be negative, indicating a vertically flipped image
+//   source.
+// "crop_width" / "crop_height" is the size to crop the src to.
+//   Must be less than or equal to src_width/src_height.
+//   Cropping parameters are pre-rotation.
+// "rotation" can be 0, 90, 180 or 270.
+// "fourcc" is a fourcc, e.g. 'I420', 'YUY2'.
+// Returns 0 on success, -1 for an invalid parameter and another non-zero
+// value for any other failure.
+LIBYUV_API
+int ConvertToI420(const uint8_t *sample,
+                  size_t sample_size,
+                  uint8_t *dst_y,
+                  int dst_stride_y,
+                  uint8_t *dst_u,
+                  int dst_stride_u,
+                  uint8_t *dst_v,
+                  int dst_stride_v,
+                  int crop_x,
+                  int crop_y,
+                  int src_width,
+                  int src_height,
+                  int crop_width,
+                  int crop_height,
+                  enum RotationMode rotation,
+                  uint32_t fourcc);
+
+#endif // INCLUDE_LIBYUV_CONVERT_H_
\ No newline at end of file
diff --git a/pkg/encoder/yuv/libyuv/convert_argb.h b/pkg/encoder/yuv/libyuv/convert_argb.h
new file mode 100644
index 000000000..ac8e97169
--- /dev/null
+++ b/pkg/encoder/yuv/libyuv/convert_argb.h
@@ -0,0 +1,24 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_
+#define INCLUDE_LIBYUV_CONVERT_ARGB_H_
+
+#include "basic_types.h"
+
+// Conversion matrices for YVU to BGR. Only declared here; the definitions
+// live in another translation unit.
+LIBYUV_API extern const struct YuvConstants kYvuI601Constants; // BT.601
+LIBYUV_API extern const struct YuvConstants kYvuJPEGConstants; // BT.601 full
+LIBYUV_API extern const struct YuvConstants kYvuH709Constants; // BT.709
+LIBYUV_API extern const struct YuvConstants kYvuF709Constants; // BT.709 full
+LIBYUV_API extern const struct YuvConstants kYvu2020Constants; // BT.2020
+LIBYUV_API extern const struct YuvConstants kYvuV2020Constants; // BT.2020 full
+
+#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_
diff --git a/pkg/encoder/yuv/libyuv/convert_to_i420.c b/pkg/encoder/yuv/libyuv/convert_to_i420.c
new file mode 100644
index 000000000..848021427
--- /dev/null
+++ b/pkg/encoder/yuv/libyuv/convert_to_i420.c
@@ -0,0 +1,116 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h> // For malloc() and free().
+
+#include "convert.h"
+#include "video_common.h"
+
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// src_width is used for source stride computation
+// src_height is used to compute location of planes, and indicate inversion
+// sample_size is measured in bytes and is the size of the frame.
+//   With MJPEG it is the compressed size of the frame.
+// Supported source formats in this cropped copy are RGBP (RGB565), ARGB and
+// ABGR; any other fourcc yields -1. sample_size is not read by this
+// implementation.
+// Returns 0 on success, -1 for an invalid parameter or unknown fourcc, 1 when
+// the temporary buffer cannot be allocated, otherwise the status of the
+// converter or of I420Rotate.
+LIBYUV_API
+int ConvertToI420(const uint8_t *sample,
+                  size_t sample_size,
+                  uint8_t *dst_y,
+                  int dst_stride_y,
+                  uint8_t *dst_u,
+                  int dst_stride_u,
+                  uint8_t *dst_v,
+                  int dst_stride_v,
+                  int crop_x,
+                  int crop_y,
+                  int src_width,
+                  int src_height,
+                  int crop_width,
+                  int crop_height,
+                  enum RotationMode rotation,
+                  uint32_t fourcc) {
+  uint32_t format = CanonicalFourCC(fourcc);
+  const uint8_t *src;
+  // TODO(nisse): Why allow crop_height < 0?
+  const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+  int r = 0;
+  // A temporary I420 buffer is needed when a rotation is requested for a
+  // format other than I420/NV12/NV21/YV12, or when converting in place.
+  LIBYUV_BOOL need_buf =
+      (rotation && format != FOURCC_I420 && format != FOURCC_NV12 &&
+       format != FOURCC_NV21 && format != FOURCC_YV12) ||
+      dst_y == sample;
+  // Remember the caller's destination: dst_* are redirected to the temporary
+  // buffer below and these copies are the final target of I420Rotate.
+  uint8_t *tmp_y = dst_y;
+  uint8_t *tmp_u = dst_u;
+  uint8_t *tmp_v = dst_v;
+  int tmp_y_stride = dst_stride_y;
+  int tmp_u_stride = dst_stride_u;
+  int tmp_v_stride = dst_stride_v;
+  uint8_t *rotate_buffer = NULL;
+  // Carries the sign of src_height so the converters flip the image when the
+  // source is stored bottom-up.
+  const int inv_crop_height =
+      (src_height < 0) ? -abs_crop_height : abs_crop_height;
+
+  if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 ||
+      crop_width <= 0 || src_height == 0 || crop_height == 0) {
+    return -1;
+  }
+
+  // One pass rotation is available for some formats. For the rest, convert
+  // to I420 (with optional vertical flipping) into a temporary I420 buffer,
+  // and then rotate the I420 to the final destination buffer.
+  // For in-place conversion, if destination dst_y is same as source sample,
+  // also enable temporary buffer.
+  if (need_buf) {
+    // Tightly packed planes: full-size Y followed by two half-size chroma
+    // planes (dimensions rounded up).
+    int y_size = crop_width * abs_crop_height;
+    int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
+    rotate_buffer = (uint8_t *) malloc(y_size + uv_size * 2); /* NOLINT */
+    if (!rotate_buffer) {
+      return 1; // Out of memory runtime error.
+    }
+    dst_y = rotate_buffer;
+    dst_u = dst_y + y_size;
+    dst_v = dst_u + uv_size;
+    dst_stride_y = crop_width;
+    dst_stride_u = dst_stride_v = ((crop_width + 1) / 2);
+  }
+
+  // The source offset and stride are derived from src_width and the bytes per
+  // pixel of the format: 2 for RGB565, 4 for ARGB and ABGR.
+  switch (format) {
+    // Single plane formats
+    case FOURCC_RGBP:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = RGB565ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
+                       dst_stride_u, dst_v, dst_stride_v, crop_width,
+                       inv_crop_height);
+      break;
+    case FOURCC_ARGB:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = ARGBToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
+                     dst_stride_u, dst_v, dst_stride_v, crop_width,
+                     inv_crop_height);
+      break;
+    case FOURCC_ABGR:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = ABGRToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
+                     dst_stride_u, dst_v, dst_stride_v, crop_width,
+                     inv_crop_height);
+      break;
+    default:
+      r = -1; // unknown fourcc - return failure code.
+  }
+
+  if (need_buf) {
+    // Rotate from the temporary buffer into the caller's planes only when the
+    // conversion succeeded; the buffer is released on every path.
+    if (!r) {
+      r = I420Rotate(dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
+                     dst_stride_v, tmp_y, tmp_y_stride, tmp_u, tmp_u_stride,
+                     tmp_v, tmp_v_stride, crop_width, abs_crop_height,
+                     rotation);
+    }
+    free(rotate_buffer);
+  }
+
+  return r;
+}
diff --git a/pkg/encoder/yuv/libyuv/cpu_id.c b/pkg/encoder/yuv/libyuv/cpu_id.c
new file mode 100644
index 000000000..166057de5
--- /dev/null
+++ b/pkg/encoder/yuv/libyuv/cpu_id.c
@@ -0,0 +1,204 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "cpu_id.h"
+
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+    !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
+    defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
+#include <immintrin.h> // For _xgetbv()
+#endif
+
+// For ArmCpuCaps() but unittested on all platforms
+#include <stdio.h> // For fopen()
+#include <string.h>
+
+// For functions that use the stack and have runtime checks for overflow,
+// use SAFEBUFFERS to avoid additional check.
+// It expands to nothing in this copy.
+#define SAFEBUFFERS
+
+// cpu_info_ variable for SIMD instruction sets detected.
+// 0 means "not detected yet"; see TestCpuFlag() in cpu_id.h.
+LIBYUV_API int cpu_info_ = 0;
+
+// Low level cpuid for X86.
+// Executes CPUID with EAX = info_eax and ECX = info_ecx and stores the
+// resulting EAX, EBX, ECX and EDX in cpu_info[0..3].
+#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
+     defined(__x86_64__)) && \
+    !defined(__pnacl__) && !defined(__CLR_VER)
+LIBYUV_API
+void CpuId(int info_eax, int info_ecx, int *cpu_info) {
+#if defined(_MSC_VER)
+  // The Visual C implementation is not part of this cropped copy, so cpu_info
+  // is left untouched when building with _MSC_VER.
+  // GCC version uses inline x86 assembly.
+#else // defined(_MSC_VER)
+  int info_ebx, info_edx;
+  asm volatile(
+#if defined(__i386__) && defined(__PIC__)
+      // Preserve ebx for fpic 32 bit.
+      "mov %%ebx, %%edi \n"
+      "cpuid \n"
+      "xchg %%edi, %%ebx \n"
+      : "=D"(info_ebx),
+#else
+      "cpuid \n"
+      : "=b"(info_ebx),
+#endif // defined( __i386__) && defined(__PIC__)
+        "+a"(info_eax), "+c"(info_ecx), "=d"(info_edx));
+  cpu_info[0] = info_eax;
+  cpu_info[1] = info_ebx;
+  cpu_info[2] = info_ecx;
+  cpu_info[3] = info_edx;
+#endif // defined(_MSC_VER)
+}
+
+#else // (defined(_M_IX86) || defined(_M_X64) ...
+// Not an x86 build: report zeros for every register.
+LIBYUV_API
+void CpuId(int eax, int ecx, int* cpu_info) {
+  (void)eax;
+  (void)ecx;
+  cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
+}
+#endif
+
+// For VS2010 and earlier emit can be used:
+//   _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
+//  __asm {
+//    xor ecx, ecx // xcr 0
+//    xgetbv
+//    mov xcr0, eax
+//  }
+// For VS2013 and earlier 32 bit, the _xgetbv(0) optimizer produces bad code.
+// https://code.google.com/p/libyuv/issues/detail?id=529
+#if defined(_M_IX86) && defined(_MSC_VER) && (_MSC_VER < 1900)
+#pragma optimize("g", off)
+#endif
+#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
+     defined(__x86_64__)) && \
+    !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
+
+// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
+// Returns the low 32 bits (EAX) of XCR0, or 0 when the inline assembly is not
+// compiled in. The instruction is emitted as raw bytes (0x0f 0x01 0xd0).
+static int GetXCR0() {
+  int xcr0 = 0;
+#if defined(__i386__) || defined(__x86_64__)
+  asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx");
+#endif // defined(__i386__) || defined(__x86_64__)
+  return xcr0;
+}
+
+#else
+// xgetbv unavailable to query for OSSave support. Return 0.
+#define GetXCR0() 0
+#endif // defined(_M_IX86) || defined(_M_X64) ..
+// Return optimization to previous setting.
+#if defined(_M_IX86) && defined(_MSC_VER) && (_MSC_VER < 1900)
+#pragma optimize("g", on)
+#endif
+
+// Based on libvpx arm_cpudetect.c
+// For Arm, but public to allow testing on any CPU
+// Scans the file cpuinfo_name for a line starting with "Features" and returns
+// kCpuHasNEON when that line lists " neon" (as a whole word) or " asimd".
+// Returns kCpuHasNEON as well when the file cannot be opened, and 0 when no
+// matching feature is found.
+LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char *cpuinfo_name) {
+  char cpuinfo_line[512];
+  FILE *f = fopen(cpuinfo_name, "re");
+  if (!f) {
+    // Assume Neon if /proc/cpuinfo is unavailable.
+    // This will occur for Chrome sandbox for Pepper or Render process.
+    return kCpuHasNEON;
+  }
+  memset(cpuinfo_line, 0, sizeof(cpuinfo_line));
+  while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f)) {
+    if (memcmp(cpuinfo_line, "Features", 8) == 0) {
+      char *p = strstr(cpuinfo_line, " neon");
+      // p[5] is the character right after " neon": it must end the word so
+      // that a longer feature name with the same prefix is not accepted.
+      if (p && (p[5] == ' ' || p[5] == '\n')) {
+        fclose(f);
+        return kCpuHasNEON;
+      }
+      // aarch64 uses asimd for Neon.
+      p = strstr(cpuinfo_line, " asimd");
+      if (p) {
+        fclose(f);
+        return kCpuHasNEON;
+      }
+    }
+  }
+  fclose(f);
+  return 0;
+}
+
+// Detects the instruction sets of the current CPU and returns them as a
+// combination of the kCpuHas* flags, always including kCpuInitialized.
+static SAFEBUFFERS int GetCpuFlags(void) {
+  int cpu_info = 0;
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+    (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
+     defined(_M_IX86))
+  // cpu_infoN[0..3] receive EAX, EBX, ECX and EDX of CPUID leaf N (see
+  // CpuId()).
+  int cpu_info0[4] = {0, 0, 0, 0};
+  int cpu_info1[4] = {0, 0, 0, 0};
+  int cpu_info7[4] = {0, 0, 0, 0};
+  CpuId(0, 0, cpu_info0);
+  CpuId(1, 0, cpu_info1);
+  // Leaf 7 is queried only when leaf 0 reports a maximum leaf of at least 7;
+  // otherwise cpu_info7 stays zeroed and no leaf 7 feature is reported.
+  if (cpu_info0[0] >= 7) {
+    CpuId(7, 0, cpu_info7);
+  }
+  cpu_info = kCpuHasX86 | ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
+             ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
+             ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
+             ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
+             ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0);
+
+  // AVX requires OS saves YMM registers.
+  if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) && // AVX and OSXSave
+      ((GetXCR0() & 6) == 6)) { // Test OS saves YMM registers
+    cpu_info |= kCpuHasAVX | ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) |
+                ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
+                ((cpu_info1[2] & 0x20000000) ? kCpuHasF16C : 0);
+
+    // Detect AVX512bw
+    // The AVX512 family (and GFNI) is only reported when XCR0 bits 5..7 are
+    // all set.
+    if ((GetXCR0() & 0xe0) == 0xe0) {
+      cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX512BW : 0;
+      cpu_info |= (cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0;
+      cpu_info |= (cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0;
+      cpu_info |= (cpu_info7[2] & 0x00000040) ? kCpuHasAVX512VBMI2 : 0;
+      cpu_info |= (cpu_info7[2] & 0x00000800) ? kCpuHasAVX512VNNI : 0;
+      cpu_info |= (cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0;
+      cpu_info |= (cpu_info7[2] & 0x00004000) ? kCpuHasAVX512VPOPCNTDQ : 0;
+      cpu_info |= (cpu_info7[2] & 0x00000100) ? kCpuHasGFNI : 0;
+    }
+  }
+#endif
+#if defined(__arm__) || defined(__aarch64__)
+  // gcc -mfpu=neon defines __ARM_NEON__
+  // __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon.
+  // For Linux, /proc/cpuinfo can be tested but without that assume Neon.
+#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__)
+  cpu_info = kCpuHasNEON;
+  // For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon
+  // flag in it.
+  // So for aarch64, neon enabling is hard coded here.
+#endif
+  // NOTE(review): on 32-bit ARM the assignment in the #else branch below
+  // replaces the value set just above, so /proc/cpuinfo always decides there.
+#if defined(__aarch64__)
+  cpu_info = kCpuHasNEON;
+#else
+  // Linux arm parse text file for neon detect.
+  cpu_info = ArmCpuCaps("/proc/cpuinfo");
+#endif
+  cpu_info |= kCpuHasARM;
+#endif // __arm__
+  cpu_info |= kCpuInitialized;
+  return cpu_info;
+}
+
+// Note that use of this function is not thread safe.
+// Runs the detection, keeps only the flags present in enable_flags, stores the
+// result with SetCpuFlags() and returns it.
+LIBYUV_API
+int MaskCpuFlags(int enable_flags) {
+  int cpu_info = GetCpuFlags() & enable_flags;
+  SetCpuFlags(cpu_info);
+  return cpu_info;
+}
+
+// Detects and stores all CPU flags (mask of -1 keeps every detected flag).
+LIBYUV_API
+int InitCpuFlags(void) {
+  return MaskCpuFlags(-1);
+}
diff --git a/pkg/encoder/yuv/libyuv/cpu_id.h b/pkg/encoder/yuv/libyuv/cpu_id.h
new file mode 100644
index 000000000..bf50b9cd1
--- /dev/null
+++ b/pkg/encoder/yuv/libyuv/cpu_id.h
@@ -0,0 +1,106 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CPU_ID_H_
+#define INCLUDE_LIBYUV_CPU_ID_H_
+
+#include "basic_types.h"
+
+// Internal flag to indicate cpuid requires initialization.
+// It is set in every value returned by the detection code, so a stored value
+// of 0 means "not detected yet".
+static const int kCpuInitialized = 0x1;
+
+// These flags are only valid on ARM processors.
+static const int kCpuHasARM = 0x2;
+static const int kCpuHasNEON = 0x4;
+// 0x8 reserved for future ARM flag.
+
+// These flags are only valid on x86 processors.
+static const int kCpuHasX86 = 0x10;
+static const int kCpuHasSSE2 = 0x20;
+static const int kCpuHasSSSE3 = 0x40;
+static const int kCpuHasSSE41 = 0x80;
+static const int kCpuHasSSE42 = 0x100; // unused at this time.
+static const int kCpuHasAVX = 0x200;
+static const int kCpuHasAVX2 = 0x400;
+static const int kCpuHasERMS = 0x800;
+static const int kCpuHasFMA3 = 0x1000;
+static const int kCpuHasF16C = 0x2000;
+static const int kCpuHasGFNI = 0x4000;
+static const int kCpuHasAVX512BW = 0x8000;
+static const int kCpuHasAVX512VL = 0x10000;
+static const int kCpuHasAVX512VNNI = 0x20000;
+static const int kCpuHasAVX512VBMI = 0x40000;
+static const int kCpuHasAVX512VBMI2 = 0x80000;
+static const int kCpuHasAVX512VBITALG = 0x100000;
+static const int kCpuHasAVX512VPOPCNTDQ = 0x200000;
+
+// Optional init function. TestCpuFlag does an auto-init.
+// Returns cpu_info flags.
+LIBYUV_API
+int InitCpuFlags(void);
+
+// Detects whether the CPU has SSE2 etc.
+// The test_flag parameter should be one of the kCpuHas constants above.
+// Returns non-zero if the instruction set is detected.
+static __inline int TestCpuFlag(int test_flag) {
+  LIBYUV_API extern int cpu_info_;
+#ifdef __ATOMIC_RELAXED
+  int cpu_info = __atomic_load_n(&cpu_info_, __ATOMIC_RELAXED);
+#else
+  int cpu_info = cpu_info_;
+#endif
+  // A stored value of 0 means the flags were never detected: detect them now.
+  return (!cpu_info ? InitCpuFlags() : cpu_info) & test_flag;
+}
+
+// Internal function for parsing /proc/cpuinfo.
+LIBYUV_API
+int ArmCpuCaps(const char *cpuinfo_name);
+
+// For testing, allow CPU flags to be disabled.
+// e.g. MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
+// MaskCpuFlags(-1) to enable all cpu specific optimizations.
+// MaskCpuFlags(1) to disable all cpu specific optimizations.
+// MaskCpuFlags(0) to reset state so next call will auto init.
+// Returns cpu_info flags.
+LIBYUV_API
+int MaskCpuFlags(int enable_flags);
+
+// Sets the CPU flags to |cpu_flags|, bypassing the detection code. |cpu_flags|
+// should be a valid combination of the kCpuHas constants above and include
+// kCpuInitialized. Use this method when running in a sandboxed process where
+// the detection code might fail (as it might access /proc/cpuinfo). In such
+// cases the cpu_info can be obtained from a non sandboxed process by calling
+// InitCpuFlags() and passed to the sandboxed process (via command line
+// parameters, IPC...) which can then call this method to initialize the CPU
+// flags.
+// Notes:
+// - when specifying 0 for |cpu_flags|, the auto initialization is enabled
+//   again.
+// - enabling CPU features that are not supported by the CPU will result in
+//   undefined behavior.
+// TODO(fbarchard): consider writing a helper function that translates from
+// other library CPU info to libyuv CPU info and add a .md doc that explains
+// CPU detection.
+static __inline void SetCpuFlags(int cpu_flags) {
+  LIBYUV_API extern int cpu_info_;
+#ifdef __ATOMIC_RELAXED
+  __atomic_store_n(&cpu_info_, cpu_flags, __ATOMIC_RELAXED);
+#else
+  cpu_info_ = cpu_flags;
+#endif
+}
+
+// Low level cpuid for X86. Returns zeros on other CPUs.
+// eax is the info type that you want.
+// ecx is typically the cpu number, and should normally be zero.
+LIBYUV_API
+void CpuId(int info_eax, int info_ecx, int *cpu_info);
+
+#endif // INCLUDE_LIBYUV_CPU_ID_H_
\ No newline at end of file
diff --git a/pkg/encoder/yuv/libyuv/libyuv.go b/pkg/encoder/yuv/libyuv/libyuv.go
new file mode 100644
index 000000000..98d4276ff
--- /dev/null
+++ b/pkg/encoder/yuv/libyuv/libyuv.go
@@ -0,0 +1,142 @@
+//go:build !darwin && !no_libyuv
+
+// Package libyuv wraps the libyuv functions used to convert frames to I420
+// and to scale I420 frames. This file links against the libyuv library
+// (-lyuv).
+package libyuv
+
+// see: https://chromium.googlesource.com/libyuv/libyuv
+
+/*
+#cgo CFLAGS: -Wall
+#cgo LDFLAGS: -lyuv
+
+#include <stdint.h>
+#include "libyuv/version.h"
+#include "libyuv/video_common.h"
+
+//
+typedef enum RotationMode {
+  kRotate0 = 0, // No rotation.
+  kRotate90 = 90, // Rotate 90 degrees clockwise.
+  kRotate180 = 180, // Rotate 180 degrees.
+  kRotate270 = 270, // Rotate 270 degrees clockwise.
+} RotationModeEnum;
+
+//
+LIBYUV_API
+int ConvertToI420(const uint8_t* sample,
+                  size_t sample_size,
+                  uint8_t* dst_y,
+                  int dst_stride_y,
+                  uint8_t* dst_u,
+                  int dst_stride_u,
+                  uint8_t* dst_v,
+                  int dst_stride_v,
+                  int crop_x,
+                  int crop_y,
+                  int src_width,
+                  int src_height,
+                  int crop_width,
+                  int crop_height,
+                  enum RotationMode rotation,
+                  uint32_t fourcc);
+
+// Supported filtering.
+typedef enum FilterMode {
+  kFilterNone = 0, // Point sample; Fastest.
+  kFilterLinear = 1, // Filter horizontally only.
+  kFilterBilinear = 2, // Faster than box, but lower quality scaling down.
+  kFilterBox = 3 // Highest quality.
+} FilterModeEnum;
+
+LIBYUV_API
+int I420Scale(const uint8_t *src_y,
+              int src_stride_y,
+              const uint8_t *src_u,
+              int src_stride_u,
+              const uint8_t *src_v,
+              int src_stride_v,
+              int src_width,
+              int src_height,
+              uint8_t *dst_y,
+              int dst_stride_y,
+              uint8_t *dst_u,
+              int dst_stride_u,
+              uint8_t *dst_v,
+              int dst_stride_v,
+              int dst_width,
+              int dst_height,
+              enum FilterMode filtering);
+*/
+import "C"
+import "fmt"
+
+// FourccRgbp is the libyuv FOURCC code of the RGBP source pixel format.
+const FourccRgbp uint32 = C.FOURCC_RGBP
+
+// FourccArgb is the libyuv FOURCC code of the ARGB source pixel format.
+const FourccArgb uint32 = C.FOURCC_ARGB
+
+// FourccAbgr is the libyuv FOURCC code of the ABGR source pixel format.
+const FourccAbgr uint32 = C.FOURCC_ABGR
+
+// Y420 converts the packed frame in src into I420 planes stored back to back
+// in dst (Y, then U, then V) by calling libyuv ConvertToI420.
+//
+// The destination layout is derived from dw x dh: the Y plane takes dw*dh
+// bytes with a row stride of dw, and each chroma plane takes
+// ((dw+1)/2)*((dh+1)/2) bytes with a row stride of (dw+1)/2.
+// stride and h are passed to libyuv as the source width and height, cx and cy
+// as the crop width and height (the crop origin is always 0,0), rot as the
+// rotation in degrees and pix as the source FOURCC. The first int parameter
+// is unused.
+//
+// src and dst must be non-empty and dst must be large enough for the three
+// planes. The status returned by ConvertToI420 is discarded.
+func Y420(src []byte, dst []byte, _, h, stride int, dw, dh int, rot uint, pix uint32, cx, cy int) {
+	cw := (dw + 1) / 2
+	ch := (dh + 1) / 2
+	i0 := dw * dh     // offset of the U plane in dst
+	i1 := i0 + cw*ch  // offset of the V plane in dst
+	yStride := dw
+	cStride := cw
+
+	C.ConvertToI420(
+		(*C.uchar)(&src[0]),
+		C.size_t(0),
+		(*C.uchar)(&dst[0]),
+		C.int(yStride),
+		(*C.uchar)(&dst[i0]),
+		C.int(cStride),
+		(*C.uchar)(&dst[i1]),
+		C.int(cStride),
+		C.int(0),
+		C.int(0),
+		C.int(stride),
+		C.int(h),
+		C.int(cx),
+		C.int(cy),
+		C.enum_RotationMode(rot),
+		C.uint32_t(pix))
+}
+
+// Y420Scale scales the I420 frame in src (w x h) into dst (dw x dh) with
+// libyuv I420Scale using point sampling (kFilterNone).
+//
+// Both buffers are read and written as Y, U and V planes stored back to back
+// with row strides equal to the plane widths; chroma planes are
+// ((width+1)/2) x ((height+1)/2). src and dst must be large enough for that
+// layout. The status returned by I420Scale is discarded.
+func Y420Scale(src []byte, dst []byte, w, h int, dw, dh int) {
+	srcWidthUV, dstWidthUV := (w+1)>>1, (dw+1)>>1
+	srcHeightUV, dstHeightUV := (h+1)>>1, (dh+1)>>1
+
+	srcYPlaneSize, dstYPlaneSize := w*h, dw*dh
+	srcUVPlaneSize, dstUVPlaneSize := srcWidthUV*srcHeightUV, dstWidthUV*dstHeightUV
+
+	srcStrideY, dstStrideY := w, dw
+	srcStrideU, dstStrideU := srcWidthUV, dstWidthUV
+	srcStrideV, dstStrideV := srcWidthUV, dstWidthUV
+
+	srcY := (*C.uchar)(&src[0])
+	srcU := (*C.uchar)(&src[srcYPlaneSize])
+	srcV := (*C.uchar)(&src[srcYPlaneSize+srcUVPlaneSize])
+
+	dstY := (*C.uchar)(&dst[0])
+	dstU := (*C.uchar)(&dst[dstYPlaneSize])
+	dstV := (*C.uchar)(&dst[dstYPlaneSize+dstUVPlaneSize])
+
+	C.I420Scale(
+		srcY,
+		C.int(srcStrideY),
+		srcU,
+		C.int(srcStrideU),
+		srcV,
+		C.int(srcStrideV),
+		C.int(w),
+		C.int(h),
+		dstY,
+		C.int(dstStrideY),
+		dstU,
+		C.int(dstStrideU),
+		dstV,
+		C.int(dstStrideV),
+		C.int(dw),
+		C.int(dh),
+		C.enum_FilterMode(C.kFilterNone))
+}
+
+// Version returns the LIBYUV_VERSION number of the libyuv headers as a
+// decimal string.
+func Version() string { return fmt.Sprintf("%v", int(C.LIBYUV_VERSION)) }
diff --git a/pkg/encoder/yuv/libyuv/libyuv2.go b/pkg/encoder/yuv/libyuv/libyuv2.go
new file mode 100644
index 000000000..f4f6a68b5
--- /dev/null
+++ b/pkg/encoder/yuv/libyuv/libyuv2.go
@@ -0,0 +1,89 @@
+//go:build darwin || no_libyuv
+
+package libyuv
+
+/*
+#cgo CFLAGS: -Wall
+
+#include "basic_types.h"
+#include "version.h"
+#include "video_common.h"
+#include "rotate.h"
+#include "scale.h"
+#include "convert.h"
+
+*/
+import "C"
+import "fmt"
+
+// FourccRgbp is the libyuv FOURCC code of the RGBP source pixel format.
+const FourccRgbp uint32 = C.FOURCC_RGBP
+
+// FourccArgb is the libyuv FOURCC code of the ARGB source pixel format.
+const FourccArgb uint32 = C.FOURCC_ARGB
+
+// FourccAbgr is the libyuv FOURCC code of the ABGR source pixel format.
+const FourccAbgr uint32 = C.FOURCC_ABGR
+
+// Y420 converts the packed frame in src into I420 planes stored back to back
+// in dst (Y, then U, then V) by calling the bundled ConvertToI420.
+//
+// The destination layout is derived from dw x dh: the Y plane takes dw*dh
+// bytes with a row stride of dw, and each chroma plane takes
+// ((dw+1)/2)*((dh+1)/2) bytes with a row stride of (dw+1)/2.
+// stride and h are passed on as the source width and height, cx and cy as the
+// crop width and height (the crop origin is always 0,0), rot as the rotation
+// in degrees and pix as the source FOURCC. The first int parameter is unused.
+//
+// src and dst must be non-empty and dst must be large enough for the three
+// planes. The status returned by ConvertToI420 is discarded.
+func Y420(src []byte, dst []byte, _, h, stride int, dw, dh int, rot uint, pix uint32, cx, cy int) {
+	cw := (dw + 1) / 2
+	ch := (dh + 1) / 2
+	i0 := dw * dh     // offset of the U plane in dst
+	i1 := i0 + cw*ch  // offset of the V plane in dst
+	yStride := dw
+	cStride := cw
+
+	C.ConvertToI420(
+		(*C.uchar)(&src[0]),
+		C.size_t(0),
+		(*C.uchar)(&dst[0]),
+		C.int(yStride),
+		(*C.uchar)(&dst[i0]),
+		C.int(cStride),
+		(*C.uchar)(&dst[i1]),
+		C.int(cStride),
+		C.int(0),
+		C.int(0),
+		C.int(stride),
+		C.int(h),
+		C.int(cx),
+		C.int(cy),
+		C.enum_RotationMode(rot),
+		C.uint32_t(pix))
+}
+
+// Y420Scale scales the I420 frame in src (w x h) into dst (dw x dh) with the
+// bundled I420Scale using point sampling (kFilterNone).
+//
+// Both buffers are read and written as Y, U and V planes stored back to back
+// with row strides equal to the plane widths; chroma planes are
+// ((width+1)/2) x ((height+1)/2). src and dst must be large enough for that
+// layout. The status returned by I420Scale is discarded.
+func Y420Scale(src []byte, dst []byte, w, h int, dw, dh int) {
+	srcWidthUV, dstWidthUV := (w+1)>>1, (dw+1)>>1
+	srcHeightUV, dstHeightUV := (h+1)>>1, (dh+1)>>1
+
+	srcYPlaneSize, dstYPlaneSize := w*h, dw*dh
+	srcUVPlaneSize, dstUVPlaneSize := srcWidthUV*srcHeightUV, dstWidthUV*dstHeightUV
+
+	srcStrideY, dstStrideY := w, dw
+	srcStrideU, dstStrideU := srcWidthUV, dstWidthUV
+	srcStrideV, dstStrideV := srcWidthUV, dstWidthUV
+
+	srcY := (*C.uchar)(&src[0])
+	srcU := (*C.uchar)(&src[srcYPlaneSize])
+	srcV := (*C.uchar)(&src[srcYPlaneSize+srcUVPlaneSize])
+
+	dstY := (*C.uchar)(&dst[0])
+	dstU := (*C.uchar)(&dst[dstYPlaneSize])
+	dstV := (*C.uchar)(&dst[dstYPlaneSize+dstUVPlaneSize])
+
+	C.I420Scale(
+		srcY,
+		C.int(srcStrideY),
+		srcU,
+		C.int(srcStrideU),
+		srcV,
+		C.int(srcStrideV),
+		C.int(w),
+		C.int(h),
+		dstY,
+		C.int(dstStrideY),
+		dstU,
+		C.int(dstStrideU),
+		dstV,
+		C.int(dstStrideV),
+		C.int(dw),
+		C.int(dh),
+		C.enum_FilterMode(C.kFilterNone))
+}
+
+// Version returns the LIBYUV_VERSION number of the bundled headers as a
+// decimal string followed by " mod", which tells it apart from the build that
+// links the libyuv library.
+func Version() string { return fmt.Sprintf("%v mod", int(C.LIBYUV_VERSION)) }
diff --git a/pkg/encoder/yuv/libyuv/planar_functions.c b/pkg/encoder/yuv/libyuv/planar_functions.c
new file mode 100644
index 000000000..a5d543cc5
--- /dev/null
+++ b/pkg/encoder/yuv/libyuv/planar_functions.c
@@ -0,0 +1,68 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "planar_functions.h"
+
+#include "cpu_id.h"
+#include "row.h"
+
+// Copy a plane of data
+//   src_y, src_stride_y: source plane and its row stride in bytes.
+//   dst_y, dst_stride_y: destination plane and its row stride in bytes.
+//   width: number of bytes copied per row.
+//   height: number of rows; a negative value writes the destination from its
+//     last row upwards (vertical flip).
+// Does nothing when width <= 0, height == 0, or when source and destination
+// are the same plane with the same stride.
+LIBYUV_API
+void CopyPlane(const uint8_t *src_y,
+               int src_stride_y,
+               uint8_t *dst_y,
+               int dst_stride_y,
+               int width,
+               int height) {
+  int y;
+  void (*CopyRow)(const uint8_t *src, uint8_t *dst, int width) = CopyRow_C;
+  if (width <= 0 || height == 0) {
+    return;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_stride_y = -dst_stride_y;
+  }
+  // Coalesce rows.
+  // When both planes are contiguous the whole plane is copied as one long
+  // row.
+  if (src_stride_y == width && dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+  // Nothing to do.
+  if (src_y == dst_y && src_stride_y == dst_stride_y) {
+    return;
+  }
+
+  // Row function selection: each later block overrides the earlier choice, so
+  // ERMS wins over AVX, which wins over SSE2.
+#if defined(HAS_COPYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_COPYROW_AVX)
+  if (TestCpuFlag(kCpuHasAVX)) {
+    CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
+  }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    CopyRow = CopyRow_ERMS;
+  }
+#endif
+
+  // Copy plane
+  for (y = 0; y < height; ++y) {
+    CopyRow(src_y, dst_y, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+}
diff --git a/pkg/encoder/yuv/libyuv/planar_functions.h b/pkg/encoder/yuv/libyuv/planar_functions.h
new file mode 100644
index 000000000..222109cfc
--- /dev/null
+++ b/pkg/encoder/yuv/libyuv/planar_functions.h
@@ -0,0 +1,46 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
+#define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
+
+#include "basic_types.h"
+
+// TODO(fbarchard): Move cpu macros to row.h
+// LIBYUV_DISABLE_X86 is defined for the toolchains listed below; it is tested
+// further down before x86-only features are enabled.
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+    (defined(__native_client__) && defined(__x86_64__)) || \
+    (defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
+#define LIBYUV_DISABLE_X86
+#endif
+// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON)
+#define LIBYUV_DISABLE_NEON
+#endif
+#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86)
+#define LIBYUV_DISABLE_X86
+#endif
+#endif
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#define HAS_ARGBAFFINEROW_SSE2
+#endif
+
+// Copy a plane of data.
+// width is the number of bytes per row and height the number of rows; a
+// negative height flips the plane vertically while copying.
+LIBYUV_API
+void CopyPlane(const uint8_t *src_y,
+               int src_stride_y,
+               uint8_t *dst_y,
+               int dst_stride_y,
+               int width,
+               int height);
+
+#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
\ No newline at end of file
diff --git a/pkg/encoder/yuv/libyuv/rotate.c b/pkg/encoder/yuv/libyuv/rotate.c
new file mode 100644
index 000000000..4aabae5b0
--- /dev/null
+++ b/pkg/encoder/yuv/libyuv/rotate.c
@@ -0,0 +1,217 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "rotate.h"
+
+#include "convert.h"
+#include "cpu_id.h"
+#include "rotate_row.h"
+#include "row.h"
+
+// Transposes a plane: source rows are consumed 8 at a time and written as 8
+// destination columns; the remaining rows (fewer than 8) go through the
+// generic C routine.
+LIBYUV_API
+void TransposePlane(const uint8_t *src,
+                    int src_stride,
+                    uint8_t *dst,
+                    int dst_stride,
+                    int width,
+                    int height) {
+  int i = height;
+
+  void (*TransposeWx8)(const uint8_t *src, int src_stride, uint8_t *dst,
+                       int dst_stride, int width) = TransposeWx8_C;
+
+  // The later block overrides the earlier one, so the "Fast" SSSE3 version is
+  // preferred when it is compiled in.
+#if defined(HAS_TRANSPOSEWX8_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    TransposeWx8 = TransposeWx8_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      TransposeWx8 = TransposeWx8_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    TransposeWx8 = TransposeWx8_Fast_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      TransposeWx8 = TransposeWx8_Fast_SSSE3;
+    }
+  }
+#endif
+
+  // Work across the source in 8x8 tiles
+  while (i >= 8) {
+    TransposeWx8(src, src_stride, dst, dst_stride, width);
+    src += 8 * src_stride; // Go down 8 rows.
+    dst += 8; // Move over 8 columns.
+    i -= 8;
+  }
+
+  if (i > 0) {
+    TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
+  }
+}
+
+// Rotates a plane by 90 degrees (see the comment in the body).
+LIBYUV_API
+void RotatePlane90(const uint8_t *src,
+                   int src_stride,
+                   uint8_t *dst,
+                   int dst_stride,
+                   int width,
+                   int height) {
+  // Rotate by 90 is a transpose with the source read
+  // from bottom to top. So set the source pointer to the end
+  // of the buffer and flip the sign of the source stride.
+  src += src_stride * (height - 1);
+  src_stride = -src_stride;
+  TransposePlane(src, src_stride, dst, dst_stride, width, height);
+}
+
+// Rotates a plane by 270 degrees (see the comment in the body).
+LIBYUV_API
+void RotatePlane270(const uint8_t *src,
+                    int src_stride,
+                    uint8_t *dst,
+                    int dst_stride,
+                    int width,
+                    int height) {
+  // Rotate by 270 is a transpose with the destination written
+  // from bottom to top. So set the destination pointer to the end
+  // of the buffer and flip the sign of the destination stride.
+  dst += dst_stride * (width - 1);
+  dst_stride = -dst_stride;
+  TransposePlane(src, src_stride, dst, dst_stride, width, height);
+}
+
+// Rotates a plane by 180 degrees: rows are swapped top to bottom and every
+// row is mirrored. Returns without touching dst when the temporary row cannot
+// be allocated.
+LIBYUV_API
+void RotatePlane180(const uint8_t *src,
+                    int src_stride,
+                    uint8_t *dst,
+                    int dst_stride,
+                    int width,
+                    int height) {
+  // Swap top and bottom row and mirror the content. Uses a temporary row.
+  align_buffer_64(row, width);
+  const uint8_t *src_bot = src + src_stride * (height - 1);
+  uint8_t *dst_bot = dst + dst_stride * (height - 1);
+  int half_height = (height + 1) >> 1;
+  int y;
+  void (*MirrorRow)(const uint8_t *src, uint8_t *dst, int width) = MirrorRow_C;
+  void (*CopyRow)(const uint8_t *src, uint8_t *dst, int width) = CopyRow_C;
+  // Assumes align_buffer_64 leaves row NULL when its malloc fails (as the
+  // upstream row.h macro does); bail out instead of writing through NULL.
+  if (!row) {
+    return;
+  }
+#if defined(HAS_MIRRORROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    MirrorRow = MirrorRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      MirrorRow = MirrorRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_MIRRORROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MirrorRow = MirrorRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      MirrorRow = MirrorRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_COPYROW_AVX)
+  if (TestCpuFlag(kCpuHasAVX)) {
+    CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
+  }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    CopyRow = CopyRow_ERMS;
+  }
+#endif
+#if defined(HAS_COPYROW_NEON)
+#endif
+  // Odd height will harmlessly mirror the middle row twice.
+  for (y = 0; y < half_height; ++y) {
+    CopyRow(src, row, width); // Copy top row into buffer
+    MirrorRow(src_bot, dst, width); // Mirror bottom row into top row
+    MirrorRow(row, dst_bot, width); // Mirror buffer into bottom row
+    src += src_stride;
+    dst += dst_stride;
+    src_bot -= src_stride;
+    dst_bot -= dst_stride;
+  }
+  free_aligned_buffer_64(row);
+}
+
+// Rotates an I420 frame by 0, 90, 180 or 270 degrees. The chroma planes are
+// rotated with half the width and height, rounded up. A negative height
+// flips the source vertically before rotating.
+// Returns 0 on success (for kRotate0 the result of I420Copy) and -1 for an
+// invalid argument or an unknown rotation mode.
+LIBYUV_API
+int I420Rotate(const uint8_t *src_y,
+               int src_stride_y,
+               const uint8_t *src_u,
+               int src_stride_u,
+               const uint8_t *src_v,
+               int src_stride_v,
+               uint8_t *dst_y,
+               int dst_stride_y,
+               uint8_t *dst_u,
+               int dst_stride_u,
+               uint8_t *dst_v,
+               int dst_stride_v,
+               int width,
+               int height,
+               enum RotationMode mode) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if ((!src_y && dst_y) || !src_u || !src_v || width <= 0 || height == 0 ||
+      !dst_y || !dst_u || !dst_v) {
+    return -1;
+  }
+
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    // Recompute from the now positive height.
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+
+  switch (mode) {
+    case kRotate0:
+      // copy frame
+      return I420Copy(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                      src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+                      dst_v, dst_stride_v, width, height);
+    case kRotate90:
+      RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+      RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+                    halfheight);
+      RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+                    halfheight);
+      return 0;
+    case kRotate270:
+      RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+      RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+                     halfheight);
+      RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+                     halfheight);
+      return 0;
+    case kRotate180:
+      RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+      RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+                     halfheight);
+      RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+                     halfheight);
+      return 0;
+    default:
+      break;
+  }
+  return -1;
+}
diff --git a/pkg/encoder/yuv/libyuv/rotate.h b/pkg/encoder/yuv/libyuv/rotate.h
new file mode 100644
index 000000000..59b9ec3cb
--- /dev/null
+++ b/pkg/encoder/yuv/libyuv/rotate.h
@@ -0,0 +1,79 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROTATE_H_
+#define INCLUDE_LIBYUV_ROTATE_H_
+
+#include "basic_types.h"
+
+// Supported rotation.
+typedef enum RotationMode {
+  kRotate0 = 0, // No rotation.
+  kRotate90 = 90, // Rotate 90 degrees clockwise.
+  kRotate180 = 180, // Rotate 180 degrees.
+  kRotate270 = 270, // Rotate 270 degrees clockwise.
+} RotationModeEnum;
+
+// Rotate I420 frame.
+// Returns 0 on success and -1 for an invalid argument or rotation mode.
+LIBYUV_API
+int I420Rotate(const uint8_t *src_y,
+               int src_stride_y,
+               const uint8_t *src_u,
+               int src_stride_u,
+               const uint8_t *src_v,
+               int src_stride_v,
+               uint8_t *dst_y,
+               int dst_stride_y,
+               uint8_t *dst_u,
+               int dst_stride_u,
+               uint8_t *dst_v,
+               int dst_stride_v,
+               int width,
+               int height,
+               enum RotationMode mode);
+
+// Rotate planes by 90, 180, 270. Deprecated.
+LIBYUV_API
+void RotatePlane90(const uint8_t *src,
+                   int src_stride,
+                   uint8_t *dst,
+                   int dst_stride,
+                   int width,
+                   int height);
+
+LIBYUV_API
+void RotatePlane180(const uint8_t *src,
+                    int src_stride,
+                    uint8_t *dst,
+                    int dst_stride,
+                    int width,
+                    int height);
+
+LIBYUV_API
+void RotatePlane270(const uint8_t *src,
+                    int src_stride,
+                    uint8_t *dst,
+                    int dst_stride,
+                    int width,
+                    int height);
+
+// The 90 and 270 functions are based on transposes.
+// Doing a transpose with reversing the read/write
+// order will result in a rotation by +- 90 degrees.
+// Deprecated.
+LIBYUV_API
+void TransposePlane(const uint8_t *src,
+                    int src_stride,
+                    uint8_t *dst,
+                    int dst_stride,
+                    int width,
+                    int height);
+
+#endif  // INCLUDE_LIBYUV_ROTATE_H_
diff --git a/pkg/encoder/yuv/libyuv/rotate_any.c b/pkg/encoder/yuv/libyuv/rotate_any.c
new file mode 100644
index 000000000..9af8c04ab
--- /dev/null
+++ b/pkg/encoder/yuv/libyuv/rotate_any.c
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "rotate_row.h"
+
+// TANY defines NAMEANY, a width-agnostic wrapper around the SIMD kernel
+// TPOS_SIMD. MASK is the kernel's column granularity minus one: the first
+// n = width - (width & MASK) columns go to TPOS_SIMD (skipped when n == 0)
+// and the remaining r = width & MASK columns go to TransposeWx8_C.
+#define TANY(NAMEANY, TPOS_SIMD, MASK)                                        \
+  void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst,              \
+               int dst_stride, int width) {                                   \
+    int r = width & MASK;                                                     \
+    int n = width - r;                                                        \
+    if (n > 0) {                                                              \
+      TPOS_SIMD(src, src_stride, dst, dst_stride, n);                         \
+    }                                                                         \
+    TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \
+  }
+
+#ifdef HAS_TRANSPOSEWX8_SSSE3
+
+TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7)
+
+#endif
+#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3
+
+TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15)
+
+#endif
+#undef TANY
+
+// TUVANY is the two-destination counterpart of TANY: the source advances by
+// two bytes per column (src + n * 2) and the remainder is handled by
+// TransposeUVWx8_C.
+#define TUVANY(NAMEANY, TPOS_SIMD, MASK)                                       \
+  void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst_a,             \
+               int dst_stride_a, uint8_t* dst_b, int dst_stride_b,             \
+               int width) {                                                    \
+    int r = width & MASK;                                                      \
+    int n = width - r;                                                         \
+    if (n > 0) {                                                               \
+      TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, n); \
+    }                                                                          \
+    TransposeUVWx8_C(src + n * 2, src_stride, dst_a + n * dst_stride_a,        \
+                     dst_stride_a, dst_b + n * dst_stride_b, dst_stride_b, r); \
+  }
+
+#ifdef HAS_TRANSPOSEUVWX8_SSE2
+
+TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7)
+
+#endif
+#undef TUVANY
diff --git a/pkg/encoder/yuv/libyuv/rotate_common.c b/pkg/encoder/yuv/libyuv/rotate_common.c
new file mode 100644
index 000000000..20c1481a7
--- /dev/null
+++ b/pkg/encoder/yuv/libyuv/rotate_common.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "rotate_row.h"
+
+// Portable transpose of a strip that is `width` columns wide and 8 rows tall.
+// For each source column, the bytes from the 8 rows become 8 consecutive
+// bytes of one destination row; the destination advances by dst_stride per
+// source column. A width of 0 does nothing.
+void TransposeWx8_C(const uint8_t *src,
+                    int src_stride,
+                    uint8_t *dst,
+                    int dst_stride,
+                    int width) {
+    int i;
+    for (i = 0; i < width; ++i) {
+        dst[0] = src[0 * src_stride];
+        dst[1] = src[1 * src_stride];
+        dst[2] = src[2 * src_stride];
+        dst[3] = src[3 * src_stride];
+        dst[4] = src[4 * src_stride];
+        dst[5] = src[5 * src_stride];
+        dst[6] = src[6 * src_stride];
+        dst[7] = src[7 * src_stride];
+        ++src;
+        dst += dst_stride;
+    }
+}
+
+// Portable transpose of an 8-row strip of interleaved byte pairs.
+// `width` counts pairs: the first byte of each pair is written to dst_a and
+// the second to dst_b, and the source advances two bytes per iteration.
+void TransposeUVWx8_C(const uint8_t *src,
+                      int src_stride,
+                      uint8_t *dst_a,
+                      int dst_stride_a,
+                      uint8_t *dst_b,
+                      int dst_stride_b,
+                      int width) {
+    int i;
+    for (i = 0; i < width; ++i) {
+        dst_a[0] = src[0 * src_stride + 0];
+        dst_b[0] = src[0 * src_stride + 1];
+        dst_a[1] = src[1 * src_stride + 0];
+        dst_b[1] = src[1 * src_stride + 1];
+        dst_a[2] = src[2 * src_stride + 0];
+        dst_b[2] = src[2 * src_stride + 1];
+        dst_a[3] = src[3 * src_stride + 0];
+        dst_b[3] = src[3 * src_stride + 1];
+        dst_a[4] = src[4 * src_stride + 0];
+        dst_b[4] = src[4 * src_stride + 1];
+        dst_a[5] = src[5 * src_stride + 0];
+        dst_b[5] = src[5 * src_stride + 1];
+        dst_a[6] = src[6 * src_stride + 0];
+        dst_b[6] = src[6 * src_stride + 1];
+        dst_a[7] = src[7 * src_stride + 0];
+        dst_b[7] = src[7 * src_stride + 1];
+        src += 2;
+        dst_a += dst_stride_a;
+        dst_b += dst_stride_b;
+    }
+}
+
+// Portable transpose of an arbitrary width x height block:
+// dst[i][j] = src[j][i] for 0 <= i < width and 0 <= j < height.
+void TransposeWxH_C(const uint8_t *src,
+                    int src_stride,
+                    uint8_t *dst,
+                    int dst_stride,
+                    int width,
+                    int height) {
+    int i;
+    for (i = 0; i < width; ++i) {
+        int j;
+        for (j = 0; j < height; ++j) {
+            dst[i * dst_stride + j] = src[j * src_stride + i];
+        }
+    }
+}
diff --git a/pkg/encoder/yuv/libyuv/rotate_gcc.c b/pkg/encoder/yuv/libyuv/rotate_gcc.c
new file mode 100644
index 000000000..54fdafff8
--- /dev/null
+++ b/pkg/encoder/yuv/libyuv/rotate_gcc.c
@@ -0,0 +1,370 @@
+/*
+ * Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "rotate_row.h"
+#include "row.h"
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+
+// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit.
+#if defined(HAS_TRANSPOSEWX8_SSSE3)
+
+// Each loop iteration loads 8 bytes from each of 8 source rows, interleaves
+// them in three unpack rounds (byte, word, dword) and stores 8 destination
+// rows of 8 bytes. The source then advances 8 bytes and width drops by 8.
+// The loop tests width only after the first pass, so the body always runs at
+// least once.
+void TransposeWx8_SSSE3(const uint8_t *src,
+                        int src_stride,
+                        uint8_t *dst,
+                        int dst_stride,
+                        int width) {
+    asm volatile(
+        // Read in the data from the source pointer.
+        // First round of bit swap.
+        LABELALIGN
+        "1: \n"
+        "movq (%0),%%xmm0 \n"
+        "movq (%0,%3),%%xmm1 \n"
+        "lea (%0,%3,2),%0 \n"
+        "punpcklbw %%xmm1,%%xmm0 \n"
+        "movq (%0),%%xmm2 \n"
+        "movdqa %%xmm0,%%xmm1 \n"
+        "palignr $0x8,%%xmm1,%%xmm1 \n"
+        "movq (%0,%3),%%xmm3 \n"
+        "lea (%0,%3,2),%0 \n"
+        "punpcklbw %%xmm3,%%xmm2 \n"
+        "movdqa %%xmm2,%%xmm3 \n"
+        "movq (%0),%%xmm4 \n"
+        "palignr $0x8,%%xmm3,%%xmm3 \n"
+        "movq (%0,%3),%%xmm5 \n"
+        "lea (%0,%3,2),%0 \n"
+        "punpcklbw %%xmm5,%%xmm4 \n"
+        "movdqa %%xmm4,%%xmm5 \n"
+        "movq (%0),%%xmm6 \n"
+        "palignr $0x8,%%xmm5,%%xmm5 \n"
+        "movq (%0,%3),%%xmm7 \n"
+        "lea (%0,%3,2),%0 \n"
+        "punpcklbw %%xmm7,%%xmm6 \n"
+        "neg %3 \n"
+        "movdqa %%xmm6,%%xmm7 \n"
+        "lea 0x8(%0,%3,8),%0 \n"
+        "palignr $0x8,%%xmm7,%%xmm7 \n"
+        "neg %3 \n"
+        // Second round of bit swap.
+        "punpcklwd %%xmm2,%%xmm0 \n"
+        "punpcklwd %%xmm3,%%xmm1 \n"
+        "movdqa %%xmm0,%%xmm2 \n"
+        "movdqa %%xmm1,%%xmm3 \n"
+        "palignr $0x8,%%xmm2,%%xmm2 \n"
+        "palignr $0x8,%%xmm3,%%xmm3 \n"
+        "punpcklwd %%xmm6,%%xmm4 \n"
+        "punpcklwd %%xmm7,%%xmm5 \n"
+        "movdqa %%xmm4,%%xmm6 \n"
+        "movdqa %%xmm5,%%xmm7 \n"
+        "palignr $0x8,%%xmm6,%%xmm6 \n"
+        "palignr $0x8,%%xmm7,%%xmm7 \n"
+        // Third round of bit swap.
+        // Write to the destination pointer.
+        "punpckldq %%xmm4,%%xmm0 \n"
+        "movq %%xmm0,(%1) \n"
+        "movdqa %%xmm0,%%xmm4 \n"
+        "palignr $0x8,%%xmm4,%%xmm4 \n"
+        "movq %%xmm4,(%1,%4) \n"
+        "lea (%1,%4,2),%1 \n"
+        "punpckldq %%xmm6,%%xmm2 \n"
+        "movdqa %%xmm2,%%xmm6 \n"
+        "movq %%xmm2,(%1) \n"
+        "palignr $0x8,%%xmm6,%%xmm6 \n"
+        "punpckldq %%xmm5,%%xmm1 \n"
+        "movq %%xmm6,(%1,%4) \n"
+        "lea (%1,%4,2),%1 \n"
+        "movdqa %%xmm1,%%xmm5 \n"
+        "movq %%xmm1,(%1) \n"
+        "palignr $0x8,%%xmm5,%%xmm5 \n"
+        "movq %%xmm5,(%1,%4) \n"
+        "lea (%1,%4,2),%1 \n"
+        "punpckldq %%xmm7,%%xmm3 \n"
+        "movq %%xmm3,(%1) \n"
+        "movdqa %%xmm3,%%xmm7 \n"
+        "palignr $0x8,%%xmm7,%%xmm7 \n"
+        "sub $0x8,%2 \n"
+        "movq %%xmm7,(%1,%4) \n"
+        "lea (%1,%4,2),%1 \n"
+        "jg 1b \n"
+        : "+r"(src),                     // %0
+          "+r"(dst),                     // %1
+          "+r"(width)                    // %2
+        : "r"((intptr_t) (src_stride)),  // %3
+          "r"((intptr_t) (dst_stride))   // %4
+        : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+          "xmm7");
+}
+
+#endif  // defined(HAS_TRANSPOSEWX8_SSSE3)
+
+// Transpose 16x8. 64 bit
+#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
+
+// Wider variant of TransposeWx8_SSSE3: each loop iteration loads 16 bytes
+// from each of 8 source rows and stores 16 destination rows of 8 bytes, then
+// advances the source 16 bytes and drops width by 16. It uses xmm8-xmm15 in
+// addition to xmm0-xmm7. The loop body always runs at least once.
+void TransposeWx8_Fast_SSSE3(const uint8_t *src,
+                             int src_stride,
+                             uint8_t *dst,
+                             int dst_stride,
+                             int width) {
+    asm volatile(
+        // Read in the data from the source pointer.
+        // First round of bit swap.
+        LABELALIGN
+        "1: \n"
+        "movdqu (%0),%%xmm0 \n"
+        "movdqu (%0,%3),%%xmm1 \n"
+        "lea (%0,%3,2),%0 \n"
+        "movdqa %%xmm0,%%xmm8 \n"
+        "punpcklbw %%xmm1,%%xmm0 \n"
+        "punpckhbw %%xmm1,%%xmm8 \n"
+        "movdqu (%0),%%xmm2 \n"
+        "movdqa %%xmm0,%%xmm1 \n"
+        "movdqa %%xmm8,%%xmm9 \n"
+        "palignr $0x8,%%xmm1,%%xmm1 \n"
+        "palignr $0x8,%%xmm9,%%xmm9 \n"
+        "movdqu (%0,%3),%%xmm3 \n"
+        "lea (%0,%3,2),%0 \n"
+        "movdqa %%xmm2,%%xmm10 \n"
+        "punpcklbw %%xmm3,%%xmm2 \n"
+        "punpckhbw %%xmm3,%%xmm10 \n"
+        "movdqa %%xmm2,%%xmm3 \n"
+        "movdqa %%xmm10,%%xmm11 \n"
+        "movdqu (%0),%%xmm4 \n"
+        "palignr $0x8,%%xmm3,%%xmm3 \n"
+        "palignr $0x8,%%xmm11,%%xmm11 \n"
+        "movdqu (%0,%3),%%xmm5 \n"
+        "lea (%0,%3,2),%0 \n"
+        "movdqa %%xmm4,%%xmm12 \n"
+        "punpcklbw %%xmm5,%%xmm4 \n"
+        "punpckhbw %%xmm5,%%xmm12 \n"
+        "movdqa %%xmm4,%%xmm5 \n"
+        "movdqa %%xmm12,%%xmm13 \n"
+        "movdqu (%0),%%xmm6 \n"
+        "palignr $0x8,%%xmm5,%%xmm5 \n"
+        "palignr $0x8,%%xmm13,%%xmm13 \n"
+        "movdqu (%0,%3),%%xmm7 \n"
+        "lea (%0,%3,2),%0 \n"
+        "movdqa %%xmm6,%%xmm14 \n"
+        "punpcklbw %%xmm7,%%xmm6 \n"
+        "punpckhbw %%xmm7,%%xmm14 \n"
+        "neg %3 \n"
+        "movdqa %%xmm6,%%xmm7 \n"
+        "movdqa %%xmm14,%%xmm15 \n"
+        "lea 0x10(%0,%3,8),%0 \n"
+        "palignr $0x8,%%xmm7,%%xmm7 \n"
+        "palignr $0x8,%%xmm15,%%xmm15 \n"
+        "neg %3 \n"
+        // Second round of bit swap.
+        "punpcklwd %%xmm2,%%xmm0 \n"
+        "punpcklwd %%xmm3,%%xmm1 \n"
+        "movdqa %%xmm0,%%xmm2 \n"
+        "movdqa %%xmm1,%%xmm3 \n"
+        "palignr $0x8,%%xmm2,%%xmm2 \n"
+        "palignr $0x8,%%xmm3,%%xmm3 \n"
+        "punpcklwd %%xmm6,%%xmm4 \n"
+        "punpcklwd %%xmm7,%%xmm5 \n"
+        "movdqa %%xmm4,%%xmm6 \n"
+        "movdqa %%xmm5,%%xmm7 \n"
+        "palignr $0x8,%%xmm6,%%xmm6 \n"
+        "palignr $0x8,%%xmm7,%%xmm7 \n"
+        "punpcklwd %%xmm10,%%xmm8 \n"
+        "punpcklwd %%xmm11,%%xmm9 \n"
+        "movdqa %%xmm8,%%xmm10 \n"
+        "movdqa %%xmm9,%%xmm11 \n"
+        "palignr $0x8,%%xmm10,%%xmm10 \n"
+        "palignr $0x8,%%xmm11,%%xmm11 \n"
+        "punpcklwd %%xmm14,%%xmm12 \n"
+        "punpcklwd %%xmm15,%%xmm13 \n"
+        "movdqa %%xmm12,%%xmm14 \n"
+        "movdqa %%xmm13,%%xmm15 \n"
+        "palignr $0x8,%%xmm14,%%xmm14 \n"
+        "palignr $0x8,%%xmm15,%%xmm15 \n"
+        // Third round of bit swap.
+        // Write to the destination pointer.
+        "punpckldq %%xmm4,%%xmm0 \n"
+        "movq %%xmm0,(%1) \n"
+        "movdqa %%xmm0,%%xmm4 \n"
+        "palignr $0x8,%%xmm4,%%xmm4 \n"
+        "movq %%xmm4,(%1,%4) \n"
+        "lea (%1,%4,2),%1 \n"
+        "punpckldq %%xmm6,%%xmm2 \n"
+        "movdqa %%xmm2,%%xmm6 \n"
+        "movq %%xmm2,(%1) \n"
+        "palignr $0x8,%%xmm6,%%xmm6 \n"
+        "punpckldq %%xmm5,%%xmm1 \n"
+        "movq %%xmm6,(%1,%4) \n"
+        "lea (%1,%4,2),%1 \n"
+        "movdqa %%xmm1,%%xmm5 \n"
+        "movq %%xmm1,(%1) \n"
+        "palignr $0x8,%%xmm5,%%xmm5 \n"
+        "movq %%xmm5,(%1,%4) \n"
+        "lea (%1,%4,2),%1 \n"
+        "punpckldq %%xmm7,%%xmm3 \n"
+        "movq %%xmm3,(%1) \n"
+        "movdqa %%xmm3,%%xmm7 \n"
+        "palignr $0x8,%%xmm7,%%xmm7 \n"
+        "movq %%xmm7,(%1,%4) \n"
+        "lea (%1,%4,2),%1 \n"
+        "punpckldq %%xmm12,%%xmm8 \n"
+        "movq %%xmm8,(%1) \n"
+        "movdqa %%xmm8,%%xmm12 \n"
+        "palignr $0x8,%%xmm12,%%xmm12 \n"
+        "movq %%xmm12,(%1,%4) \n"
+        "lea (%1,%4,2),%1 \n"
+        "punpckldq %%xmm14,%%xmm10 \n"
+        "movdqa %%xmm10,%%xmm14 \n"
+        "movq %%xmm10,(%1) \n"
+        "palignr $0x8,%%xmm14,%%xmm14 \n"
+        "punpckldq %%xmm13,%%xmm9 \n"
+        "movq %%xmm14,(%1,%4) \n"
+        "lea (%1,%4,2),%1 \n"
+        "movdqa %%xmm9,%%xmm13 \n"
+        "movq %%xmm9,(%1) \n"
+        "palignr $0x8,%%xmm13,%%xmm13 \n"
+        "movq %%xmm13,(%1,%4) \n"
+        "lea (%1,%4,2),%1 \n"
+        "punpckldq %%xmm15,%%xmm11 \n"
+        "movq %%xmm11,(%1) \n"
+        "movdqa %%xmm11,%%xmm15 \n"
+        "palignr $0x8,%%xmm15,%%xmm15 \n"
+        "sub $0x10,%2 \n"
+        "movq %%xmm15,(%1,%4) \n"
+        "lea (%1,%4,2),%1 \n"
+        "jg 1b \n"
+        : "+r"(src),                     // %0
+          "+r"(dst),                     // %1
+          "+r"(width)                    // %2
+        : "r"((intptr_t) (src_stride)),  // %3
+          "r"((intptr_t) (dst_stride))   // %4
+        : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+          "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+          "xmm15");
+}
+
+#endif  // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
+
+// Transpose UV 8x8. 64 bit.
+#if defined(HAS_TRANSPOSEUVWX8_SSE2)
+
+// Each loop iteration loads 16 bytes from each of 8 source rows and, after
+// three unpack rounds, stores the low 8 bytes of each result register to
+// dst_a (movlpd) and the high 8 bytes to dst_b (movhpd). The source advances
+// 16 bytes and width drops by 8 per iteration. The loop body always runs at
+// least once.
+void TransposeUVWx8_SSE2(const uint8_t *src,
+                         int src_stride,
+                         uint8_t *dst_a,
+                         int dst_stride_a,
+                         uint8_t *dst_b,
+                         int dst_stride_b,
+                         int width) {
+    asm volatile(
+        // Read in the data from the source pointer.
+        // First round of bit swap.
+        LABELALIGN
+        "1: \n"
+        "movdqu (%0),%%xmm0 \n"
+        "movdqu (%0,%4),%%xmm1 \n"
+        "lea (%0,%4,2),%0 \n"
+        "movdqa %%xmm0,%%xmm8 \n"
+        "punpcklbw %%xmm1,%%xmm0 \n"
+        "punpckhbw %%xmm1,%%xmm8 \n"
+        "movdqa %%xmm8,%%xmm1 \n"
+        "movdqu (%0),%%xmm2 \n"
+        "movdqu (%0,%4),%%xmm3 \n"
+        "lea (%0,%4,2),%0 \n"
+        "movdqa %%xmm2,%%xmm8 \n"
+        "punpcklbw %%xmm3,%%xmm2 \n"
+        "punpckhbw %%xmm3,%%xmm8 \n"
+        "movdqa %%xmm8,%%xmm3 \n"
+        "movdqu (%0),%%xmm4 \n"
+        "movdqu (%0,%4),%%xmm5 \n"
+        "lea (%0,%4,2),%0 \n"
+        "movdqa %%xmm4,%%xmm8 \n"
+        "punpcklbw %%xmm5,%%xmm4 \n"
+        "punpckhbw %%xmm5,%%xmm8 \n"
+        "movdqa %%xmm8,%%xmm5 \n"
+        "movdqu (%0),%%xmm6 \n"
+        "movdqu (%0,%4),%%xmm7 \n"
+        "lea (%0,%4,2),%0 \n"
+        "movdqa %%xmm6,%%xmm8 \n"
+        "punpcklbw %%xmm7,%%xmm6 \n"
+        "neg %4 \n"
+        "lea 0x10(%0,%4,8),%0 \n"
+        "punpckhbw %%xmm7,%%xmm8 \n"
+        "movdqa %%xmm8,%%xmm7 \n"
+        "neg %4 \n"
+        // Second round of bit swap.
+        "movdqa %%xmm0,%%xmm8 \n"
+        "movdqa %%xmm1,%%xmm9 \n"
+        "punpckhwd %%xmm2,%%xmm8 \n"
+        "punpckhwd %%xmm3,%%xmm9 \n"
+        "punpcklwd %%xmm2,%%xmm0 \n"
+        "punpcklwd %%xmm3,%%xmm1 \n"
+        "movdqa %%xmm8,%%xmm2 \n"
+        "movdqa %%xmm9,%%xmm3 \n"
+        "movdqa %%xmm4,%%xmm8 \n"
+        "movdqa %%xmm5,%%xmm9 \n"
+        "punpckhwd %%xmm6,%%xmm8 \n"
+        "punpckhwd %%xmm7,%%xmm9 \n"
+        "punpcklwd %%xmm6,%%xmm4 \n"
+        "punpcklwd %%xmm7,%%xmm5 \n"
+        "movdqa %%xmm8,%%xmm6 \n"
+        "movdqa %%xmm9,%%xmm7 \n"
+        // Third round of bit swap.
+        // Write to the destination pointer.
+        "movdqa %%xmm0,%%xmm8 \n"
+        "punpckldq %%xmm4,%%xmm0 \n"
+        "movlpd %%xmm0,(%1) \n"  // Write back U channel
+        "movhpd %%xmm0,(%2) \n"  // Write back V channel
+        "punpckhdq %%xmm4,%%xmm8 \n"
+        "movlpd %%xmm8,(%1,%5) \n"
+        "lea (%1,%5,2),%1 \n"
+        "movhpd %%xmm8,(%2,%6) \n"
+        "lea (%2,%6,2),%2 \n"
+        "movdqa %%xmm2,%%xmm8 \n"
+        "punpckldq %%xmm6,%%xmm2 \n"
+        "movlpd %%xmm2,(%1) \n"
+        "movhpd %%xmm2,(%2) \n"
+        "punpckhdq %%xmm6,%%xmm8 \n"
+        "movlpd %%xmm8,(%1,%5) \n"
+        "lea (%1,%5,2),%1 \n"
+        "movhpd %%xmm8,(%2,%6) \n"
+        "lea (%2,%6,2),%2 \n"
+        "movdqa %%xmm1,%%xmm8 \n"
+        "punpckldq %%xmm5,%%xmm1 \n"
+        "movlpd %%xmm1,(%1) \n"
+        "movhpd %%xmm1,(%2) \n"
+        "punpckhdq %%xmm5,%%xmm8 \n"
+        "movlpd %%xmm8,(%1,%5) \n"
+        "lea (%1,%5,2),%1 \n"
+        "movhpd %%xmm8,(%2,%6) \n"
+        "lea (%2,%6,2),%2 \n"
+        "movdqa %%xmm3,%%xmm8 \n"
+        "punpckldq %%xmm7,%%xmm3 \n"
+        "movlpd %%xmm3,(%1) \n"
+        "movhpd %%xmm3,(%2) \n"
+        "punpckhdq %%xmm7,%%xmm8 \n"
+        "sub $0x8,%3 \n"
+        "movlpd %%xmm8,(%1,%5) \n"
+        "lea (%1,%5,2),%1 \n"
+        "movhpd %%xmm8,(%2,%6) \n"
+        "lea (%2,%6,2),%2 \n"
+        "jg 1b \n"
+        : "+r"(src),                       // %0
+          "+r"(dst_a),                     // %1
+          "+r"(dst_b),                     // %2
+          "+r"(width)                      // %3
+        : "r"((intptr_t) (src_stride)),    // %4
+          "r"((intptr_t) (dst_stride_a)),  // %5
+          "r"((intptr_t) (dst_stride_b))   // %6
+        : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+          "xmm7", "xmm8", "xmm9");
+}
+
+#endif  // defined(HAS_TRANSPOSEUVWX8_SSE2)
+
+#endif  // defined(__x86_64__)
|| defined(__i386__) diff --git a/pkg/encoder/yuv/libyuv/rotate_row.h b/pkg/encoder/yuv/libyuv/rotate_row.h new file mode 100644 index 000000000..afdae49f0 --- /dev/null +++ b/pkg/encoder/yuv/libyuv/rotate_row.h @@ -0,0 +1,106 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_ +#define INCLUDE_LIBYUV_ROTATE_ROW_H_ + +#include "basic_types.h" + +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__native_client__) && defined(__x86_64__)) || \ + (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) +#define LIBYUV_DISABLE_X86 +#endif +#if defined(__native_client__) +#define LIBYUV_DISABLE_NEON +#endif +// MemorySanitizer does not support assembly code yet. 
http://crbug.com/344505 +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON) +#define LIBYUV_DISABLE_NEON +#endif +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86) +#define LIBYUV_DISABLE_X86 +#endif +#endif + +// The following are available for GCC 32 or 64 bit: +#if !defined(LIBYUV_DISABLE_X86) && (defined(__i386__) || defined(__x86_64__)) +#define HAS_TRANSPOSEWX8_SSSE3 +#endif + +// The following are available for 64 bit GCC: +#if !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__) +#define HAS_TRANSPOSEWX8_FAST_SSSE3 +#define HAS_TRANSPOSEUVWX8_SSE2 +#endif + +void TransposeWxH_C(const uint8_t *src, + int src_stride, + uint8_t *dst, + int dst_stride, + int width, + int height); + +void TransposeWx8_C(const uint8_t *src, + int src_stride, + uint8_t *dst, + int dst_stride, + int width); + +void TransposeWx8_SSSE3(const uint8_t *src, + int src_stride, + uint8_t *dst, + int dst_stride, + int width); + +void TransposeWx8_Fast_SSSE3(const uint8_t *src, + int src_stride, + uint8_t *dst, + int dst_stride, + int width); + +void TransposeWx8_Any_SSSE3(const uint8_t *src, + int src_stride, + uint8_t *dst, + int dst_stride, + int width); + +void TransposeWx8_Fast_Any_SSSE3(const uint8_t *src, + int src_stride, + uint8_t *dst, + int dst_stride, + int width); + +void TransposeUVWx8_C(const uint8_t *src, + int src_stride, + uint8_t *dst_a, + int dst_stride_a, + uint8_t *dst_b, + int dst_stride_b, + int width); + +void TransposeUVWx8_SSE2(const uint8_t *src, + int src_stride, + uint8_t *dst_a, + int dst_stride_a, + uint8_t *dst_b, + int dst_stride_b, + int width); + +void TransposeUVWx8_Any_SSE2(const uint8_t *src, + int src_stride, + uint8_t *dst_a, + int dst_stride_a, + uint8_t *dst_b, + int dst_stride_b, + int width); + +#endif // INCLUDE_LIBYUV_ROTATE_ROW_H_ diff --git a/pkg/encoder/yuv/libyuv/row.h b/pkg/encoder/yuv/libyuv/row.h new file mode 100644 index 000000000..ca1c0c298 --- /dev/null +++ 
b/pkg/encoder/yuv/libyuv/row.h @@ -0,0 +1,426 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_ROW_H_ +#define INCLUDE_LIBYUV_ROW_H_ + +#include // For NULL +#include // For malloc + +#include "basic_types.h" + +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__native_client__) && defined(__x86_64__)) || \ + (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) +#define LIBYUV_DISABLE_X86 +#endif +#if defined(__native_client__) +#define LIBYUV_DISABLE_NEON +#endif +// MemorySanitizer does not support assembly code yet. http://crbug.com/344505 +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON) +#define LIBYUV_DISABLE_NEON +#endif +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86) +#define LIBYUV_DISABLE_X86 +#endif +#endif + +// GCC >= 4.7.0 required for AVX2. 
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
+#define GCC_HAS_AVX2 1
+#endif  // GNUC >= 4.7
+#endif  // __GNUC__
+
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+// Conversions:
+#define HAS_ABGRTOYROW_SSSE3
+#define HAS_ARGBTOYROW_SSSE3
+#define HAS_BGRATOYROW_SSSE3
+#define HAS_COPYROW_ERMS
+#define HAS_COPYROW_SSE2
+#define HAS_INTERPOLATEROW_SSSE3
+#define HAS_MIRRORROW_SSSE3
+#define HAS_MIRRORSPLITUVROW_SSSE3
+#if !defined(LIBYUV_BIT_EXACT)
+#define HAS_ABGRTOUVROW_SSSE3
+#define HAS_ARGBTOUVROW_SSSE3
+#endif
+
+// Effects:
+#define HAS_ARGBGRAYROW_SSSE3
+#define HAS_ARGBMIRRORROW_SSE2
+
+#endif
+
+// The following are available on all x86 platforms, but
+// require VS2012, clang 3.4 or gcc 4.7.
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \
+     defined(GCC_HAS_AVX2))
+#define HAS_ARGBEXTRACTALPHAROW_AVX2
+#define HAS_ARGBMIRRORROW_AVX2
+#define HAS_ARGBTOYROW_AVX2
+#define HAS_COPYROW_AVX
+#define HAS_INTERPOLATEROW_AVX2
+#define HAS_MIRRORROW_AVX2
+#if !defined(LIBYUV_BIT_EXACT)
+#define HAS_ARGBTOUVROW_AVX2
+#endif
+
+#endif
+
+// The following are available for gcc/clang x86 platforms:
+// TODO(fbarchard): Port to Visual C
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+#define HAS_MIRRORUVROW_SSSE3
+
+#endif
+
+// The following are available for AVX2 gcc/clang x86 platforms:
+// TODO(fbarchard): Port to Visual C
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || defined(__i386__)) && \
+    (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#define HAS_ABGRTOYROW_AVX2
+#define HAS_MIRRORUVROW_AVX2
+#if !defined(LIBYUV_BIT_EXACT)
+#define HAS_ABGRTOUVROW_AVX2
+#endif
+
+#endif
+
+// SIMD_ALIGNED, LIBYUV_NOINLINE and the fixed-size vector typedefs, in three
+// flavours: MSVC (__declspec), GCC/clang (__attribute__), and a plain-array
+// fallback with no alignment for any other compiler.
+#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
+ #if defined(VISUALC_HAS_AVX2)
+#define SIMD_ALIGNED(var) __declspec(align(32)) var
+#else
+#define SIMD_ALIGNED(var) __declspec(align(16)) var
+#endif
+#define LIBYUV_NOINLINE __declspec(noinline)
+typedef __declspec(align(16)) int16_t vec16[8];
+typedef __declspec(align(16)) int32_t vec32[4];
+typedef __declspec(align(16)) float vecf32[4];
+typedef __declspec(align(16)) int8_t vec8[16];
+typedef __declspec(align(16)) uint16_t uvec16[8];
+typedef __declspec(align(16)) uint32_t uvec32[4];
+typedef __declspec(align(16)) uint8_t uvec8[16];
+typedef __declspec(align(32)) int16_t lvec16[16];
+typedef __declspec(align(32)) int32_t lvec32[8];
+typedef __declspec(align(32)) int8_t lvec8[32];
+typedef __declspec(align(32)) uint16_t ulvec16[16];
+typedef __declspec(align(32)) uint32_t ulvec32[8];
+typedef __declspec(align(32)) uint8_t ulvec8[32];
+#elif !defined(__pnacl__) && (defined(__GNUC__) || defined(__clang__))
+// Caveat GCC 4.2 to 4.7 have a known issue using vectors with const.
+#if defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)
+#define SIMD_ALIGNED(var) var __attribute__((aligned(32)))
+#else
+#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
+#endif
+#define LIBYUV_NOINLINE __attribute__((noinline))
+typedef int16_t __attribute__((vector_size(16))) vec16;
+typedef int32_t __attribute__((vector_size(16))) vec32;
+typedef float __attribute__((vector_size(16))) vecf32;
+typedef int8_t __attribute__((vector_size(16))) vec8;
+typedef uint16_t __attribute__((vector_size(16))) uvec16;
+typedef uint32_t __attribute__((vector_size(16))) uvec32;
+typedef uint8_t __attribute__((vector_size(16))) uvec8;
+typedef int16_t __attribute__((vector_size(32))) lvec16;
+typedef int32_t __attribute__((vector_size(32))) lvec32;
+typedef int8_t __attribute__((vector_size(32))) lvec8;
+typedef uint16_t __attribute__((vector_size(32))) ulvec16;
+typedef uint32_t __attribute__((vector_size(32))) ulvec32;
+typedef uint8_t __attribute__((vector_size(32))) ulvec8;
+#else
+#define SIMD_ALIGNED(var) var
+#define LIBYUV_NOINLINE
+typedef int16_t vec16[8];
+typedef int32_t vec32[4];
+typedef float vecf32[4];
+typedef int8_t vec8[16];
+typedef uint16_t uvec16[8];
+typedef uint32_t uvec32[4];
+typedef uint8_t uvec8[16];
+typedef int16_t lvec16[16];
+typedef int32_t lvec32[8];
+typedef int8_t lvec8[32];
+typedef uint16_t ulvec16[16];
+typedef uint32_t ulvec32[8];
+typedef uint8_t ulvec8[32];
+#endif
+
+// NOTE(review): the condition below is false only when __aarch64__ and
+// __arm__ are both defined, so in practice the struct is declared on every
+// target. Switching to && would leave ARM builds with no YuvConstants
+// definition in this header -- confirm the intent before changing it.
+#if !defined(__aarch64__) || !defined(__arm__)
+// This struct is for Intel color conversion.
+struct YuvConstants {
+    uint8_t kUVToB[32];
+    uint8_t kUVToG[32];
+    uint8_t kUVToR[32];
+    int16_t kYToRgb[16];
+    int16_t kYBiasToRgb[16];
+};
+#endif
+
+// Non-zero when p is a multiple of a. The mask arithmetic is only correct
+// when a is a power of two.
+#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))
+
+// Declares two locals: var##_mem, the raw malloc result, and var, that
+// pointer rounded up to a 64-byte boundary. The malloc result is not checked
+// here; when malloc returns NULL, var is NULL too, so callers should test
+// var before using it.
+#define align_buffer_64(var, size)                                         \
+  void* var##_mem = malloc((size) + 63); /* NOLINT */                      \
+  uint8_t* var = (uint8_t*)(((intptr_t)var##_mem + 63) & ~63) /* NOLINT */
+
+// Releases a buffer declared with align_buffer_64 and clears var.
+#define free_aligned_buffer_64(var) \
+  free(var##_mem);                  \
+  var = NULL
+
+#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__)
+#define OMITFP
+#else
+#define OMITFP __attribute__((optimize("omit-frame-pointer")))
+#endif
+
+// NaCL macros for GCC x86 and x64.
+#if defined(__native_client__)
+#define LABELALIGN ".p2align 5\n"
+#else
+#define LABELALIGN
+#endif
+
+// Row kernel prototypes. They are declared unconditionally; the HAS_* macros
+// defined earlier in this header are what callers test before selecting a
+// SIMD variant.
+
+void ARGBToYRow_AVX2(const uint8_t *src_argb, uint8_t *dst_y, int width);
+
+void ARGBToYRow_Any_AVX2(const uint8_t *src_ptr, uint8_t *dst_ptr, int width);
+
+void ABGRToYRow_AVX2(const uint8_t *src_abgr, uint8_t *dst_y, int width);
+
+void ABGRToYRow_Any_AVX2(const uint8_t *src_ptr, uint8_t *dst_ptr, int width);
+
+void ARGBToYRow_SSSE3(const uint8_t *src_argb, uint8_t *dst_y, int width);
+
+void ABGRToYRow_SSSE3(const uint8_t *src_abgr, uint8_t *dst_y, int width);
+
+void BGRAToYRow_SSSE3(const uint8_t *src_bgra, uint8_t *dst_y, int width);
+
+void ARGBToYRow_C(const uint8_t *src_rgb, uint8_t *dst_y, int width);
+
+void ABGRToYRow_C(const uint8_t *src_rgb, uint8_t *dst_y, int width);
+
+void RGB565ToYRow_C(const uint8_t *src_rgb565, uint8_t *dst_y, int width);
+
+void ARGBToYRow_Any_SSSE3(const uint8_t *src_ptr, uint8_t *dst_ptr, int width);
+
+void BGRAToYRow_Any_SSSE3(const uint8_t *src_ptr, uint8_t *dst_ptr, int width);
+
+void ABGRToYRow_Any_SSSE3(const uint8_t *src_ptr, uint8_t *dst_ptr, int width);
+
+void ARGBToUVRow_AVX2(const uint8_t *src_argb,
+                      int src_stride_argb,
+                      uint8_t *dst_u,
+                      uint8_t *dst_v,
+                      int width);
+
+void ABGRToUVRow_AVX2(const uint8_t *src_abgr,
+                      int src_stride_abgr,
+                      uint8_t *dst_u,
+                      uint8_t *dst_v,
+                      int width);
+
+void ARGBToUVRow_SSSE3(const uint8_t *src_argb,
+                       int src_stride_argb,
+                       uint8_t *dst_u,
+                       uint8_t *dst_v,
+                       int width);
+
+void BGRAToUVRow_SSSE3(const uint8_t *src_bgra,
+                       int src_stride_bgra,
+                       uint8_t *dst_u,
+                       uint8_t *dst_v,
+                       int width);
+
+void ABGRToUVRow_SSSE3(const uint8_t *src_abgr,
+                       int src_stride_abgr,
+                       uint8_t *dst_u,
+                       uint8_t *dst_v,
+                       int width);
+
+void RGBAToUVRow_SSSE3(const uint8_t *src_rgba,
+                       int src_stride_rgba,
+                       uint8_t *dst_u,
+                       uint8_t *dst_v,
+                       int width);
+
+void ARGBToUVRow_Any_AVX2(const uint8_t *src_ptr,
+                          int src_stride,
+                          uint8_t *dst_u,
+                          uint8_t *dst_v,
+                          int width);
+
+void ABGRToUVRow_Any_AVX2(const uint8_t *src_ptr,
+                          int src_stride,
+                          uint8_t *dst_u,
+                          uint8_t *dst_v,
+                          int width);
+
+void ARGBToUVRow_Any_SSSE3(const uint8_t *src_ptr,
+                           int src_stride,
+                           uint8_t *dst_u,
+                           uint8_t *dst_v,
+                           int width);
+
+void BGRAToUVRow_Any_SSSE3(const uint8_t *src_ptr,
+                           int src_stride,
+                           uint8_t *dst_u,
+                           uint8_t *dst_v,
+                           int width);
+
+void ABGRToUVRow_Any_SSSE3(const uint8_t *src_ptr,
+                           int src_stride,
+                           uint8_t *dst_u,
+                           uint8_t *dst_v,
+                           int width);
+
+void RGBAToUVRow_Any_SSSE3(const uint8_t *src_ptr,
+                           int src_stride,
+                           uint8_t *dst_u,
+                           uint8_t *dst_v,
+                           int width);
+
+void ARGBToUVRow_C(const uint8_t *src_rgb,
+                   int src_stride_rgb,
+                   uint8_t *dst_u,
+                   uint8_t *dst_v,
+                   int width);
+
+void BGRAToUVRow_C(const uint8_t *src_rgb,
+                   int src_stride_rgb,
+                   uint8_t *dst_u,
+                   uint8_t *dst_v,
+                   int width);
+
+void ABGRToUVRow_C(const uint8_t *src_rgb,
+                   int src_stride_rgb,
+                   uint8_t *dst_u,
+                   uint8_t *dst_v,
+                   int width);
+
+void RGBAToUVRow_C(const uint8_t *src_rgb,
+                   int src_stride_rgb,
+                   uint8_t *dst_u,
+                   uint8_t *dst_v,
+                   int width);
+
+void RGB565ToUVRow_C(const uint8_t *src_rgb565,
+                     int src_stride_rgb565,
+                     uint8_t *dst_u,
+                     uint8_t *dst_v,
+                     int width);
+
+void MirrorRow_AVX2(const uint8_t *src, uint8_t *dst, int width);
+
+void MirrorRow_SSSE3(const uint8_t *src, uint8_t *dst, int width);
+
+void MirrorRow_C(const uint8_t *src, uint8_t *dst, int width);
+
+void MirrorRow_Any_AVX2(const uint8_t *src_ptr, uint8_t *dst_ptr, int width);
+
+void MirrorRow_Any_SSSE3(const uint8_t *src_ptr, uint8_t *dst_ptr, int width);
+
+void MirrorRow_Any_SSE2(const uint8_t *src, uint8_t *dst, int width);
+
+void MirrorUVRow_AVX2(const uint8_t *src_uv, uint8_t *dst_uv, int width);
+
+void MirrorUVRow_SSSE3(const uint8_t *src_uv, uint8_t *dst_uv, int width);
+
+void MirrorUVRow_Any_AVX2(const uint8_t *src_ptr, uint8_t *dst_ptr, int width);
+
+void MirrorUVRow_Any_SSSE3(const uint8_t *src_ptr, uint8_t *dst_ptr, int width);
+
+void ARGBMirrorRow_AVX2(const uint8_t *src, uint8_t *dst, int width);
+
+void ARGBMirrorRow_SSE2(const uint8_t *src, uint8_t *dst, int width);
+
+void ARGBMirrorRow_C(const uint8_t *src, uint8_t *dst, int width);
+
+void ARGBMirrorRow_Any_AVX2(const uint8_t *src_ptr,
+                            uint8_t *dst_ptr,
+                            int width);
+
+void ARGBMirrorRow_Any_SSE2(const uint8_t *src_ptr,
+                            uint8_t *dst_ptr,
+                            int width);
+
+void CopyRow_SSE2(const uint8_t *src, uint8_t *dst, int width);
+
+void CopyRow_AVX(const uint8_t *src, uint8_t *dst, int width);
+
+void CopyRow_ERMS(const uint8_t *src, uint8_t *dst, int width);
+
+void CopyRow_C(const uint8_t *src, uint8_t *dst, int count);
+
+void CopyRow_Any_SSE2(const uint8_t *src_ptr, uint8_t *dst_ptr, int width);
+
+void CopyRow_Any_AVX(const uint8_t *src_ptr, uint8_t *dst_ptr, int width);
+
+void RGB565ToARGBRow_SSE2(const uint8_t *src, uint8_t *dst, int width);
+
+void RGB565ToARGBRow_AVX2(const uint8_t *src_rgb565,
+                          uint8_t *dst_argb,
+                          int width);
+
+void RGB565ToARGBRow_C(const uint8_t *src_rgb565, uint8_t *dst_argb, int width);
+
+void RGB565ToARGBRow_Any_SSE2(const uint8_t *src_ptr,
+                              uint8_t *dst_ptr,
+                              int width);
+
+void RGB565ToARGBRow_Any_AVX2(const uint8_t *src_ptr,
+                              uint8_t *dst_ptr,
+                              int width);
+
+// Used for I420Scale, ARGBScale, and ARGBInterpolate.
+void InterpolateRow_C(uint8_t *dst_ptr, + const uint8_t *src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction); + +void InterpolateRow_SSSE3(uint8_t *dst_ptr, + const uint8_t *src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction); + +void InterpolateRow_AVX2(uint8_t *dst_ptr, + const uint8_t *src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction); + +void InterpolateRow_Any_SSSE3(uint8_t *dst_ptr, + const uint8_t *src_ptr, + ptrdiff_t src_stride_ptr, + int width, + int source_y_fraction); + +void InterpolateRow_Any_AVX2(uint8_t *dst_ptr, + const uint8_t *src_ptr, + ptrdiff_t src_stride_ptr, + int width, + int source_y_fraction); + +#endif // INCLUDE_LIBYUV_ROW_H_ \ No newline at end of file diff --git a/pkg/encoder/yuv/libyuv/row_any.c b/pkg/encoder/yuv/libyuv/row_any.c new file mode 100644 index 000000000..fcc49c672 --- /dev/null +++ b/pkg/encoder/yuv/libyuv/row_any.c @@ -0,0 +1,206 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "row.h" + +#include <string.h> // For memset. + +// Subsampled source needs to be increased by 1 if not even. +#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift)) + +// Any 1 to 1. 
+#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) /* Defines NAMEANY(src_ptr, dst_ptr, width): ANY_SIMD processes the first n = width & ~MASK elements in place, then the remaining r = width & MASK elements are pushed through the vin/vout scratch buffers. SBPP and BPP scale the source and destination byte offsets; UVSHIFT shifts the source offset and, via SS, rounds the leftover source count up. */ \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t vin[128]); \ + SIMD_ALIGNED(uint8_t vout[128]); \ + memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \ + int r = width & MASK; /* leftover elements not covered by the direct call */ \ + int n = width & ~MASK; /* element count handed to ANY_SIMD directly */ \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, n); \ + } \ + memcpy(vin, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); /* stage the leftover source bytes; the rest of vin stays zero */ \ + ANY_SIMD(vin, vout, MASK + 1); /* one group of MASK + 1 elements on scratch; runs even when r is 0 */ \ + memcpy(dst_ptr + n * BPP, vout, r * BPP); /* copy out only the r valid results */ \ + } + +#ifdef HAS_COPYROW_AVX + +ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63) + +#endif +#ifdef HAS_COPYROW_SSE2 + +ANY11(CopyRow_Any_SSE2, CopyRow_SSE2, 0, 1, 1, 31) + +#endif + +#ifdef HAS_ARGBTOYROW_AVX2 + +ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31) + +#endif +#ifdef HAS_ABGRTOYROW_AVX2 + +ANY11(ABGRToYRow_Any_AVX2, ABGRToYRow_AVX2, 0, 4, 1, 31) + +#endif +#ifdef HAS_ARGBTOYROW_SSSE3 + +ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15) + +#endif +#ifdef HAS_BGRATOYROW_SSSE3 + +ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15) + +ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15) + +#endif + +#undef ANY11 + +// Any 1 to 1 interpolate. Takes 2 rows of source via stride. 
+#define ANY11I(NAMEANY, ANY_SIMD, TD, TS, SBPP, BPP, MASK) /* Defines NAMEANY for a two-row kernel: ANY_SIMD handles the first n = width & ~MASK elements straight from src_ptr with src_stride; the remaining r = width & MASK elements go through scratch rows placed 64 elements apart in vin. TS and TD are the source and destination element types. */ \ + void NAMEANY(TD* dst_ptr, const TS* src_ptr, ptrdiff_t src_stride, \ + int width, int source_y_fraction) { \ + SIMD_ALIGNED(TS vin[64 * 2]); /* two scratch source rows of 64 elements each */ \ + SIMD_ALIGNED(TD vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; /* leftover elements not covered by the direct call */ \ + int n = width & ~MASK; /* element count handed to ANY_SIMD directly */ \ + if (n > 0) { \ + ANY_SIMD(dst_ptr, src_ptr, src_stride, n, source_y_fraction); \ + } \ + memcpy(vin, src_ptr + n * SBPP, r * SBPP * sizeof(TS)); /* stage the leftover of the first row */ \ + if (source_y_fraction) { /* second row is staged only for a nonzero fraction; otherwise it stays zeroed */ \ + memcpy(vin + 64, src_ptr + src_stride + n * SBPP, \ + r * SBPP * sizeof(TS)); \ + } \ + ANY_SIMD(vout, vin, 64, MASK + 1, source_y_fraction); /* stride 64 addresses the second scratch row */ \ + memcpy(dst_ptr + n * BPP, vout, r * BPP * sizeof(TD)); /* copy out only the r valid results */ \ + } + +#ifdef HAS_INTERPOLATEROW_AVX2 + +ANY11I(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, uint8_t, uint8_t, 1, 1, 31) + +#endif +#ifdef HAS_INTERPOLATEROW_SSSE3 + +ANY11I(InterpolateRow_Any_SSSE3, + InterpolateRow_SSSE3, + uint8_t, + uint8_t, + 1, + 1, + 15) + +#endif + +#undef ANY11I + +// Any 1 to 1 mirror. 
+#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) /* Defines NAMEANY for a kernel that presumably reverses element order (every instantiation below wraps a *Mirror* kernel; confirm against their definitions). The last n = width & ~MASK source elements are written to the start of dst_ptr; the first r = width & MASK source elements are staged in scratch and written after them. */ \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t vin[64]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; /* leading source elements handled via scratch */ \ + int n = width & ~MASK; /* element count handed to ANY_SIMD directly */ \ + if (n > 0) { \ + ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); /* skip the first r source elements; they are handled below */ \ + } \ + memcpy(vin, src_ptr, r* BPP); /* first r elements occupy the front of the zeroed scratch group */ \ + ANY_SIMD(vin, vout, MASK + 1); \ + memcpy(dst_ptr + n * BPP, vout + (MASK + 1 - r) * BPP, r * BPP); /* take the last r slots of vout as the results for those elements */ \ + } + +#ifdef HAS_MIRRORROW_AVX2 + +ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31) + +#endif +#ifdef HAS_MIRRORROW_SSSE3 + +ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15) + +#endif +#ifdef HAS_MIRRORUVROW_AVX2 + +ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15) + +#endif +#ifdef HAS_MIRRORUVROW_SSSE3 + +ANY11M(MirrorUVRow_Any_SSSE3, MirrorUVRow_SSSE3, 2, 7) + +#endif +#ifdef HAS_ARGBMIRRORROW_AVX2 + +ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7) + +#endif +#ifdef HAS_ARGBMIRRORROW_SSE2 + +ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3) + +#endif +#undef ANY11M + +// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes. +// 128 byte row allows for 32 avx ARGB pixels. 
+#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) /* Defines NAMEANY for a kernel that reads two source rows (src_ptr and src_ptr + src_stride) and writes separate dst_u and dst_v outputs. ANY_SIMD handles the first n = width & ~MASK elements directly; the remaining r = width & MASK elements use scratch rows placed 128 bytes apart in vin and vout. */ \ + void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, \ + uint8_t* dst_v, int width) { \ + SIMD_ALIGNED(uint8_t vin[128 * 2]); \ + SIMD_ALIGNED(uint8_t vout[128 * 2]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; /* leftover elements not covered by the direct call */ \ + int n = width & ~MASK; /* element count handed to ANY_SIMD directly */ \ + if (n > 0) { \ + ANY_SIMD(src_ptr, src_stride, dst_u, dst_v, n); \ + } \ + memcpy(vin, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); /* leftover of the first source row */ \ + memcpy(vin + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \ + SS(r, UVSHIFT) * BPP); /* leftover of the second source row */ \ + if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \ + memcpy(vin + SS(r, UVSHIFT) * BPP, vin + SS(r, UVSHIFT) * BPP - BPP, \ + BPP); \ + memcpy(vin + 128 + SS(r, UVSHIFT) * BPP, \ + vin + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ + } \ + ANY_SIMD(vin, 128, vout, vout + 128, MASK + 1); /* scratch stride is 128; U results land in vout, V results in vout + 128 */ \ + memcpy(dst_u + (n >> 1), vout, SS(r, 1)); /* output offset is n / 2; leftover output count is r / 2 rounded up */ \ + memcpy(dst_v + (n >> 1), vout + 128, SS(r, 1)); \ + } + +#ifdef HAS_ARGBTOUVROW_AVX2 + +ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31) + +#endif +#ifdef HAS_ABGRTOUVROW_AVX2 + +ANY12S(ABGRToUVRow_Any_AVX2, ABGRToUVRow_AVX2, 0, 4, 31) + +#endif +#ifdef HAS_ARGBTOUVROW_SSSE3 + +ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15) + +ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15) + +ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15) + +ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15) + +#endif +#undef ANY12S diff --git a/pkg/encoder/yuv/libyuv/row_common.c b/pkg/encoder/yuv/libyuv/row_common.c new file mode 100644 index 000000000..34a93a074 --- /dev/null +++ b/pkg/encoder/yuv/libyuv/row_common.c @@ -0,0 +1,887 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "row.h" + +#include <assert.h> +#include <string.h> // For memcpy and memset. + +#define STATIC_CAST(type, expr) (type)(expr) + +// This macro controls YUV to RGB using unsigned math to extend range of +// YUV to RGB coefficients to 0 to 4 instead of 0 to 2 for more accuracy on B: +// LIBYUV_UNLIMITED_DATA + +// Macros to enable unlimited data for each colorspace +// LIBYUV_UNLIMITED_BT601 +// LIBYUV_UNLIMITED_BT709 +// LIBYUV_UNLIMITED_BT2020 + +#if !defined(LIBYUV_BIT_EXACT) && (defined(__x86_64__) || defined(_M_X64) || \ + defined(__i386__) || defined(_M_IX86)) +#define LIBYUV_ARGBTOUV_PAVGB 1 +#define LIBYUV_RGBTOU_TRUNCATE 1 +#endif +#if defined(LIBYUV_BIT_EXACT) +#define LIBYUV_UNATTENUATE_DUP 1 +#endif + +// llvm x86 is poor at ternary operator, so use branchless min/max. + +#define USE_BRANCHLESS 1 +#if USE_BRANCHLESS + +static __inline int32_t clamp0(int32_t v) { + return -(v >= 0) & v; +} + +// TODO(fbarchard): make clamp255 preserve negative values. +static __inline int32_t clamp255(int32_t v) { + return (-(v >= 255) | v) & 255; +} + +static __inline int32_t clamp1023(int32_t v) { + return (-(v >= 1023) | v) & 1023; +} + +// clamp to max +static __inline int32_t ClampMax(int32_t v, int32_t max) { + return (-(v >= max) | v) & max; +} + +static __inline uint32_t Abs(int32_t v) { + int m = -(v < 0); + return (v + m) ^ m; +} + +#else // USE_BRANCHLESS +static __inline int32_t clamp0(int32_t v) { + return (v < 0) ? 0 : v; +} + +static __inline int32_t clamp255(int32_t v) { + return (v > 255) ? 255 : v; +} + +static __inline int32_t clamp1023(int32_t v) { + return (v > 1023) ? 1023 : v; +} + +static __inline int32_t ClampMax(int32_t v, int32_t max) { + return (v > max) ? max : v; +} + +static __inline uint32_t Abs(int32_t v) { + return (v < 0) ? 
-v : v; +} +#endif // USE_BRANCHLESS + +static __inline uint32_t Clamp(int32_t val) { + int v = clamp0(val); + return (uint32_t) (clamp255(v)); +} + +static __inline uint32_t Clamp10(int32_t val) { + int v = clamp0(val); + return (uint32_t) (clamp1023(v)); +} + +// Little Endian +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ + defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) || \ + (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#define WRITEWORD(p, v) *(uint32_t*)(p) = v +#else +static inline void WRITEWORD(uint8_t* p, uint32_t v) { + p[0] = (uint8_t)(v & 255); + p[1] = (uint8_t)((v >> 8) & 255); + p[2] = (uint8_t)((v >> 16) & 255); + p[3] = (uint8_t)((v >> 24) & 255); +} +#endif + +void RGB565ToARGBRow_C(const uint8_t *src_rgb565, + uint8_t *dst_argb, + int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = STATIC_CAST(uint8_t, src_rgb565[0] & 0x1f); + uint8_t g = STATIC_CAST( + uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3)); + uint8_t r = STATIC_CAST(uint8_t, src_rgb565[1] >> 3); + dst_argb[0] = STATIC_CAST(uint8_t, (b << 3) | (b >> 2)); + dst_argb[1] = STATIC_CAST(uint8_t, (g << 2) | (g >> 4)); + dst_argb[2] = STATIC_CAST(uint8_t, (r << 3) | (r >> 2)); + dst_argb[3] = 255u; + dst_argb += 4; + src_rgb565 += 2; + } +} + +// 8 bit +// Intel SSE/AVX uses the following equivalent formula +// 0x7e80 = (66 + 129 + 25) * -128 + 0x1000 (for +16) and 0x0080 for round. +// return (66 * ((int)r - 128) + 129 * ((int)g - 128) + 25 * ((int)b - 128) + +// 0x7e80) >> 8; + +static __inline uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, (66 * r + 129 * g + 25 * b + 0x1080) >> 8); +} + +#define AVGB(a, b) (((a) + (b) + 1) >> 1) + +// LIBYUV_RGBTOU_TRUNCATE mimics x86 code that does not round. 
+#ifdef LIBYUV_RGBTOU_TRUNCATE + +static __inline uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, (112 * b - 74 * g - 38 * r + 0x8000) >> 8); +} + +static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, (112 * r - 94 * g - 18 * b + 0x8000) >> 8); +} + +#else +// TODO(fbarchard): Add rounding to x86 SIMD and use this +static __inline uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, (112 * b - 74 * g - 38 * r + 0x8080) >> 8); +} +static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, (112 * r - 94 * g - 18 * b + 0x8080) >> 8); +} +#endif + +// LIBYUV_ARGBTOUV_PAVGB mimics x86 code that subsamples with 2 pavgb. +#if !defined(LIBYUV_ARGBTOUV_PAVGB) +static __inline int RGB2xToU(uint16_t r, uint16_t g, uint16_t b) { + return STATIC_CAST( + uint8_t, ((112 / 2) * b - (74 / 2) * g - (38 / 2) * r + 0x8080) >> 8); +} +static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) { + return STATIC_CAST( + uint8_t, ((112 / 2) * r - (94 / 2) * g - (18 / 2) * b + 0x8080) >> 8); +} +#endif + +// ARGBToY_C and ARGBToUV_C +// Intel version mimic SSE/AVX which does 2 pavgb +#if LIBYUV_ARGBTOUV_PAVGB +#define MAKEROWY(NAME, R, G, B, BPP) \ + void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \ + src_rgb += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \ + AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \ + uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \ + AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \ + uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \ + 
AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + src_rgb += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \ + uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \ + uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + } \ + } +#else +// ARM version does sum / 2 then multiply by 2x smaller coefficients +#define MAKEROWY(NAME, R, G, B, BPP) \ + void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \ + src_rgb += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \ + src_rgb1[B + BPP] + 1) >> \ + 1; \ + uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \ + src_rgb1[G + BPP] + 1) >> \ + 1; \ + uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \ + src_rgb1[R + BPP] + 1) >> \ + 1; \ + dst_u[0] = RGB2xToU(ar, ag, ab); \ + dst_v[0] = RGB2xToV(ar, ag, ab); \ + src_rgb += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint16_t ab = src_rgb[B] + src_rgb1[B]; \ + uint16_t ag = src_rgb[G] + src_rgb1[G]; \ + uint16_t ar = src_rgb[R] + src_rgb1[R]; \ + dst_u[0] = RGB2xToU(ar, ag, ab); \ + dst_v[0] = RGB2xToV(ar, ag, ab); \ + } \ + } +#endif + +MAKEROWY(ARGB, 2, 1, 0, 4) + +MAKEROWY(BGRA, 1, 2, 3, 4) + +MAKEROWY(ABGR, 0, 1, 2, 4) + +MAKEROWY(RGBA, 3, 2, 1, 4) + +#undef MAKEROWY + +// JPeg uses a variation on BT.601-1 full range +// y = 0.29900 * r + 0.58700 * g + 0.11400 * b +// u = -0.16874 * r - 
0.33126 * g + 0.50000 * b + center +// v = 0.50000 * r - 0.41869 * g - 0.08131 * b + center +// BT.601 Mpeg range uses: +// b 0.1016 * 255 = 25.908 = 25 +// g 0.5078 * 255 = 129.489 = 129 +// r 0.2578 * 255 = 65.739 = 66 +// JPeg 7 bit Y (deprecated) +// b 0.11400 * 128 = 14.592 = 15 +// g 0.58700 * 128 = 75.136 = 75 +// r 0.29900 * 128 = 38.272 = 38 +// JPeg 8 bit Y: +// b 0.11400 * 256 = 29.184 = 29 +// g 0.58700 * 256 = 150.272 = 150 +// r 0.29900 * 256 = 76.544 = 77 +// JPeg 8 bit U: +// b 0.50000 * 255 = 127.5 = 127 +// g -0.33126 * 255 = -84.4713 = -84 +// r -0.16874 * 255 = -43.0287 = -43 +// JPeg 8 bit V: +// b -0.08131 * 255 = -20.73405 = -20 +// g -0.41869 * 255 = -106.76595 = -107 +// r 0.50000 * 255 = 127.5 = 127 + +// 8 bit +static __inline uint8_t RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { + return (77 * r + 150 * g + 29 * b + 128) >> 8; +} + +#if defined(LIBYUV_ARGBTOUV_PAVGB) + +static __inline uint8_t RGBToUJ(uint8_t r, uint8_t g, uint8_t b) { + return (127 * b - 84 * g - 43 * r + 0x8080) >> 8; +} + +static __inline uint8_t RGBToVJ(uint8_t r, uint8_t g, uint8_t b) { + return (127 * r - 107 * g - 20 * b + 0x8080) >> 8; +} + +#else +static __inline uint8_t RGB2xToUJ(uint16_t r, uint16_t g, uint16_t b) { + return ((127 / 2) * b - (84 / 2) * g - (43 / 2) * r + 0x8080) >> 8; +} +static __inline uint8_t RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) { + return ((127 / 2) * r - (107 / 2) * g - (20 / 2) * b + 0x8080) >> 8; +} +#endif + +// ARGBToYJ_C and ARGBToUVJ_C +// Intel version mimic SSE/AVX which does 2 pavgb +#if LIBYUV_ARGBTOUV_PAVGB +#define MAKEROWYJ(NAME, R, G, B, BPP) \ + void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \ + src_rgb += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = 
src_rgb + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \ + AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \ + uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \ + AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \ + uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \ + AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + src_rgb += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \ + uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \ + uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + } \ + } +#else +// ARM version does sum / 2 then multiply by 2x smaller coefficients +#define MAKEROWYJ(NAME, R, G, B, BPP) \ + void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \ + src_rgb += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \ + src_rgb1[B + BPP] + 1) >> \ + 1; \ + uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \ + src_rgb1[G + BPP] + 1) >> \ + 1; \ + uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \ + src_rgb1[R + BPP] + 1) >> \ + 1; \ + dst_u[0] = RGB2xToUJ(ar, ag, ab); \ + dst_v[0] = RGB2xToVJ(ar, ag, ab); \ + src_rgb += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint16_t ab = (src_rgb[B] + src_rgb1[B]); \ + uint16_t ag = (src_rgb[G] + src_rgb1[G]); \ + uint16_t ar = (src_rgb[R] + src_rgb1[R]); \ + 
dst_u[0] = RGB2xToUJ(ar, ag, ab); \ + dst_v[0] = RGB2xToVJ(ar, ag, ab); \ + } \ + } + +#endif + +MAKEROWYJ(ARGB, 2, 1, 0, 4) + +MAKEROWYJ(ABGR, 0, 1, 2, 4) + +MAKEROWYJ(RGBA, 3, 2, 1, 4) + +MAKEROWYJ(RGB24, 2, 1, 0, 3) + +MAKEROWYJ(RAW, 0, 1, 2, 3) + +#undef MAKEROWYJ + +void RGB565ToYRow_C(const uint8_t *src_rgb565, uint8_t *dst_y, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = src_rgb565[0] & 0x1f; + uint8_t g = STATIC_CAST( + uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3)); + uint8_t r = src_rgb565[1] >> 3; + b = STATIC_CAST(uint8_t, (b << 3) | (b >> 2)); + g = STATIC_CAST(uint8_t, (g << 2) | (g >> 4)); + r = STATIC_CAST(uint8_t, (r << 3) | (r >> 2)); + dst_y[0] = RGBToY(r, g, b); + src_rgb565 += 2; + dst_y += 1; + } +} + +void RGB565ToUVRow_C(const uint8_t *src_rgb565, + int src_stride_rgb565, + uint8_t *dst_u, + uint8_t *dst_v, + int width) { + const uint8_t *next_rgb565 = src_rgb565 + src_stride_rgb565; + int x; + for (x = 0; x < width - 1; x += 2) { + uint8_t b0 = STATIC_CAST(uint8_t, src_rgb565[0] & 0x1f); + uint8_t g0 = STATIC_CAST( + uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3)); + uint8_t r0 = STATIC_CAST(uint8_t, src_rgb565[1] >> 3); + uint8_t b1 = STATIC_CAST(uint8_t, src_rgb565[2] & 0x1f); + uint8_t g1 = STATIC_CAST( + uint8_t, (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3)); + uint8_t r1 = STATIC_CAST(uint8_t, src_rgb565[3] >> 3); + uint8_t b2 = STATIC_CAST(uint8_t, next_rgb565[0] & 0x1f); + uint8_t g2 = STATIC_CAST( + uint8_t, (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3)); + uint8_t r2 = STATIC_CAST(uint8_t, next_rgb565[1] >> 3); + uint8_t b3 = STATIC_CAST(uint8_t, next_rgb565[2] & 0x1f); + uint8_t g3 = STATIC_CAST( + uint8_t, (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3)); + uint8_t r3 = STATIC_CAST(uint8_t, next_rgb565[3] >> 3); + + b0 = STATIC_CAST(uint8_t, (b0 << 3) | (b0 >> 2)); + g0 = STATIC_CAST(uint8_t, (g0 << 2) | (g0 >> 4)); + r0 = STATIC_CAST(uint8_t, (r0 << 3) 
| (r0 >> 2)); + b1 = STATIC_CAST(uint8_t, (b1 << 3) | (b1 >> 2)); + g1 = STATIC_CAST(uint8_t, (g1 << 2) | (g1 >> 4)); + r1 = STATIC_CAST(uint8_t, (r1 << 3) | (r1 >> 2)); + b2 = STATIC_CAST(uint8_t, (b2 << 3) | (b2 >> 2)); + g2 = STATIC_CAST(uint8_t, (g2 << 2) | (g2 >> 4)); + r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2)); + b3 = STATIC_CAST(uint8_t, (b3 << 3) | (b3 >> 2)); + g3 = STATIC_CAST(uint8_t, (g3 << 2) | (g3 >> 4)); + r3 = STATIC_CAST(uint8_t, (r3 << 3) | (r3 >> 2)); + +#if LIBYUV_ARGBTOUV_PAVGB + uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); + uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3)); + uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3)); + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); +#else + uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1; + uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1; + uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1; + dst_u[0] = RGB2xToU(r, g, b); + dst_v[0] = RGB2xToV(r, g, b); +#endif + + src_rgb565 += 4; + next_rgb565 += 4; + dst_u += 1; + dst_v += 1; + } + if (width & 1) { + uint8_t b0 = STATIC_CAST(uint8_t, src_rgb565[0] & 0x1f); + uint8_t g0 = STATIC_CAST( + uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3)); + uint8_t r0 = STATIC_CAST(uint8_t, src_rgb565[1] >> 3); + uint8_t b2 = STATIC_CAST(uint8_t, next_rgb565[0] & 0x1f); + uint8_t g2 = STATIC_CAST( + uint8_t, (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3)); + uint8_t r2 = STATIC_CAST(uint8_t, next_rgb565[1] >> 3); + b0 = STATIC_CAST(uint8_t, (b0 << 3) | (b0 >> 2)); + g0 = STATIC_CAST(uint8_t, (g0 << 2) | (g0 >> 4)); + r0 = STATIC_CAST(uint8_t, (r0 << 3) | (r0 >> 2)); + b2 = STATIC_CAST(uint8_t, (b2 << 3) | (b2 >> 2)); + g2 = STATIC_CAST(uint8_t, (g2 << 2) | (g2 >> 4)); + r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2)); + +#if LIBYUV_ARGBTOUV_PAVGB + uint8_t ab = AVGB(b0, b2); + uint8_t ag = AVGB(g0, g2); + uint8_t ar = AVGB(r0, r2); + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); +#else + uint16_t b = b0 + b2; + uint16_t g = g0 + 
g2; + uint16_t r = r0 + r2; + dst_u[0] = RGB2xToU(r, g, b); + dst_v[0] = RGB2xToV(r, g, b); +#endif + } +} + +#define REPEAT8(v) (v) | ((v) << 8) +#define SHADE(f, v) v* f >> 24 + +#undef REPEAT8 +#undef SHADE + +#define REPEAT8(v) (v) | ((v) << 8) +#define SHADE(f, v) v* f >> 16 + +#undef REPEAT8 +#undef SHADE + +#define SHADE(f, v) clamp255(v + f) + +#undef SHADE + +#define SHADE(f, v) clamp0(f - v) + +#undef SHADE + +// Macros to create SIMD specific yuv to rgb conversion constants. + +// clang-format off + +#if defined(__aarch64__) || defined(__arm__) +// Bias values include subtract 128 from U and V, bias from Y and rounding. +// For B and R bias is negative. For G bias is positive. +#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \ + {{UB, VR, UG, VG, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, \ + {YG, (UB * 128 - YB), (UG * 128 + VG * 128 + YB), (VR * 128 - YB), YB, 0, \ + 0, 0}} +#else +#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \ + {{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, \ + UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, \ + {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, \ + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \ + {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, \ + 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, \ + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \ + {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}} +#endif + +// clang-format on + +#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR) \ + const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = \ + YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR); \ + const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = \ + YUVCONSTANTSBODY(YG, YB, VR, VG, UG, UB); + +// TODO(fbarchard): Generate SIMD structures from float matrix. 
+ +// BT.601 limited range YUV to RGB reference +// R = (Y - 16) * 1.164 + V * 1.596 +// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813 +// B = (Y - 16) * 1.164 + U * 2.018 +// KR = 0.299; KB = 0.114 + +// U and V contributions to R,G,B. +#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT601) +#define UB 129 /* round(2.018 * 64) */ +#else +#define UB 128 /* max(128, round(2.018 * 64)) */ +#endif +#define UG 25 /* round(0.391 * 64) */ +#define VG 52 /* round(0.813 * 64) */ +#define VR 102 /* round(1.596 * 64) */ + +// Y contribution to R,G,B. Scale and bias. +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ + +MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR) + +#undef YG +#undef YB +#undef UB +#undef UG +#undef VG +#undef VR + +// BT.601 full range YUV to RGB reference (aka JPEG) +// * R = Y + V * 1.40200 +// * G = Y - U * 0.34414 - V * 0.71414 +// * B = Y + U * 1.77200 +// KR = 0.299; KB = 0.114 + +// U and V contributions to R,G,B. +#define UB 113 /* round(1.77200 * 64) */ +#define UG 22 /* round(0.34414 * 64) */ +#define VG 46 /* round(0.71414 * 64) */ +#define VR 90 /* round(1.40200 * 64) */ + +// Y contribution to R,G,B. Scale and bias. +#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ +#define YB 32 /* 64 / 2 */ + +MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR) + +#undef YG +#undef YB +#undef UB +#undef UG +#undef VG +#undef VR + +// BT.709 limited range YUV to RGB reference +// R = (Y - 16) * 1.164 + V * 1.793 +// G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533 +// B = (Y - 16) * 1.164 + U * 2.112 +// KR = 0.2126, KB = 0.0722 + +// U and V contributions to R,G,B. +#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT709) +#define UB 135 /* round(2.112 * 64) */ +#else +#define UB 128 /* max(128, round(2.112 * 64)) */ +#endif +#define UG 14 /* round(0.213 * 64) */ +#define VG 34 /* round(0.533 * 64) */ +#define VR 115 /* round(1.793 * 64) */ + +// Y contribution to R,G,B. 
Scale and bias. +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ + +MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR) + +#undef YG +#undef YB +#undef UB +#undef UG +#undef VG +#undef VR + +// BT.709 full range YUV to RGB reference +// R = Y + V * 1.5748 +// G = Y - U * 0.18732 - V * 0.46812 +// B = Y + U * 1.8556 +// KR = 0.2126, KB = 0.0722 + +// U and V contributions to R,G,B. +#define UB 119 /* round(1.8556 * 64) */ +#define UG 12 /* round(0.18732 * 64) */ +#define VG 30 /* round(0.46812 * 64) */ +#define VR 101 /* round(1.5748 * 64) */ + +// Y contribution to R,G,B. Scale and bias. (same as jpeg) +#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */ +#define YB 32 /* 64 / 2 */ + +MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR) + +#undef YG +#undef YB +#undef UB +#undef UG +#undef VG +#undef VR + +// BT.2020 limited range YUV to RGB reference +// R = (Y - 16) * 1.164384 + V * 1.67867 +// G = (Y - 16) * 1.164384 - U * 0.187326 - V * 0.65042 +// B = (Y - 16) * 1.164384 + U * 2.14177 +// KR = 0.2627; KB = 0.0593 + +// U and V contributions to R,G,B. +#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT2020) +#define UB 137 /* round(2.142 * 64) */ +#else +#define UB 128 /* max(128, round(2.142 * 64)) */ +#endif +#define UG 12 /* round(0.187326 * 64) */ +#define VG 42 /* round(0.65042 * 64) */ +#define VR 107 /* round(1.67867 * 64) */ + +// Y contribution to R,G,B. Scale and bias. 
+#define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */ +#define YB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */ + +MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR) + +#undef YG +#undef YB +#undef UB +#undef UG +#undef VG +#undef VR + +// BT.2020 full range YUV to RGB reference +// R = Y + V * 1.474600 +// G = Y - U * 0.164553 - V * 0.571353 +// B = Y + U * 1.881400 +// KR = 0.2627; KB = 0.0593 + +#define UB 120 /* round(1.881400 * 64) */ +#define UG 11 /* round(0.164553 * 64) */ +#define VG 37 /* round(0.571353 * 64) */ +#define VR 94 /* round(1.474600 * 64) */ + +// Y contribution to R,G,B. Scale and bias. (same as jpeg) +#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */ +#define YB 32 /* 64 / 2 */ + +MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR) + +#undef YG +#undef YB +#undef UB +#undef UG +#undef VG +#undef VR + +#undef BB +#undef BG +#undef BR + +#undef MAKEYUVCONSTANTS + +#if defined(__aarch64__) || defined(__arm__) +#define LOAD_YUV_CONSTANTS \ + int ub = yuvconstants->kUVCoeff[0]; \ + int vr = yuvconstants->kUVCoeff[1]; \ + int ug = yuvconstants->kUVCoeff[2]; \ + int vg = yuvconstants->kUVCoeff[3]; \ + int yg = yuvconstants->kRGBCoeffBias[0]; \ + int bb = yuvconstants->kRGBCoeffBias[1]; \ + int bg = yuvconstants->kRGBCoeffBias[2]; \ + int br = yuvconstants->kRGBCoeffBias[3] + +#define CALC_RGB16 \ + int32_t y1 = (uint32_t)(y32 * yg) >> 16; \ + int b16 = y1 + (u * ub) - bb; \ + int g16 = y1 + bg - (u * ug + v * vg); \ + int r16 = y1 + (v * vr) - br +#else +#define LOAD_YUV_CONSTANTS \ + int ub = yuvconstants->kUVToB[0]; \ + int ug = yuvconstants->kUVToG[0]; \ + int vg = yuvconstants->kUVToG[1]; \ + int vr = yuvconstants->kUVToR[1]; \ + int yg = yuvconstants->kYToRgb[0]; \ + int yb = yuvconstants->kYBiasToRgb[0] + +#define CALC_RGB16 \ + int32_t y1 = ((uint32_t)(y32 * yg) >> 16) + yb; \ + int8_t ui = (int8_t)u; \ + int8_t vi = (int8_t)v; \ + ui -= 0x80; \ + vi -= 0x80; \ + int b16 = y1 + (ui * ub); \ + int g16 = y1 - (ui * ug + vi * vg); \ + int r16 
= y1 + (vi * vr) +#endif + +void MirrorRow_C(const uint8_t *src, uint8_t *dst, int width) { + int x; + src += width - 1; + for (x = 0; x < width - 1; x += 2) { + dst[x] = src[0]; + dst[x + 1] = src[-1]; + src -= 2; + } + if (width & 1) { + dst[width - 1] = src[0]; + } +} + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits +// 16384 = 10 bits +// 4096 = 12 bits +// 256 = 16 bits +// TODO(fbarchard): change scale to bits +#define C16TO8(v, scale) clamp255(((v) * (scale)) >> 16) + +void CopyRow_C(const uint8_t *src, uint8_t *dst, int count) { + memcpy(dst, src, count); +} + +// Divide source RGB by alpha and store to destination. +// b = (b * 255 + (a / 2)) / a; +// g = (g * 255 + (a / 2)) / a; +// r = (r * 255 + (a / 2)) / a; +// Reciprocal method is off by 1 on some values. ie 125 +// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower. +#define T(a) 0x01000000 + (0x10000 / a) +const uint32_t fixed_invtbl8[256] = { + 0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), + T(0x07), T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), + T(0x0e), T(0x0f), T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), + T(0x15), T(0x16), T(0x17), T(0x18), T(0x19), T(0x1a), T(0x1b), + T(0x1c), T(0x1d), T(0x1e), T(0x1f), T(0x20), T(0x21), T(0x22), + T(0x23), T(0x24), T(0x25), T(0x26), T(0x27), T(0x28), T(0x29), + T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f), T(0x30), + T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37), + T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), + T(0x3f), T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), + T(0x46), T(0x47), T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), + T(0x4d), T(0x4e), T(0x4f), T(0x50), T(0x51), T(0x52), T(0x53), + T(0x54), T(0x55), T(0x56), T(0x57), T(0x58), T(0x59), T(0x5a), + T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f), T(0x60), T(0x61), + T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67), T(0x68), + T(0x69), T(0x6a), 
T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f), + T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), + T(0x77), T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), + T(0x7e), T(0x7f), T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), + T(0x85), T(0x86), T(0x87), T(0x88), T(0x89), T(0x8a), T(0x8b), + T(0x8c), T(0x8d), T(0x8e), T(0x8f), T(0x90), T(0x91), T(0x92), + T(0x93), T(0x94), T(0x95), T(0x96), T(0x97), T(0x98), T(0x99), + T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f), T(0xa0), + T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7), + T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), + T(0xaf), T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), + T(0xb6), T(0xb7), T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), + T(0xbd), T(0xbe), T(0xbf), T(0xc0), T(0xc1), T(0xc2), T(0xc3), + T(0xc4), T(0xc5), T(0xc6), T(0xc7), T(0xc8), T(0xc9), T(0xca), + T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf), T(0xd0), T(0xd1), + T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7), T(0xd8), + T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf), + T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), + T(0xe7), T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), + T(0xee), T(0xef), T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), + T(0xf5), T(0xf6), T(0xf7), T(0xf8), T(0xf9), T(0xfa), T(0xfb), + T(0xfc), T(0xfd), T(0xfe), 0x01000100}; +#undef T + +// Blend 2 rows into 1. +static void HalfRow_C(const uint8_t *src_uv, + ptrdiff_t src_uv_stride, + uint8_t *dst_uv, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; + } +} + +// C version 2x2 -> 2x1. 
+// Blends two rows of `width` bytes into dst_ptr. The second row starts
+// src_stride bytes after src_ptr and is weighted by source_y_fraction / 256;
+// the first row receives the remaining weight. source_y_fraction must lie in
+// [0, 255] (enforced with assert). A fraction of 0 copies the first row, 128
+// is delegated to HalfRow_C, and any other value takes the weighted sum.
+void InterpolateRow_C(uint8_t *dst_ptr,
+                      const uint8_t *src_ptr,
+                      ptrdiff_t src_stride,
+                      int width,
+                      int source_y_fraction) {
+    const uint8_t *second_row = src_ptr + src_stride;
+    const int second_weight = source_y_fraction;
+    const int first_weight = 256 - second_weight;
+    int i;
+    assert(source_y_fraction >= 0);
+    assert(source_y_fraction < 256);
+
+    switch (second_weight) {
+        case 0:
+            // Nothing to blend: the output is the first row.
+            memcpy(dst_ptr, src_ptr, width);
+            return;
+        case 128:
+            // Equal weights: use the dedicated averaging helper.
+            HalfRow_C(src_ptr, src_stride, dst_ptr, width);
+            return;
+        default:
+            break;
+    }
+    for (i = 0; i < width; ++i) {
+        // Adding 128 rounds to nearest before the 8 fractional bits are dropped.
+        dst_ptr[i] = STATIC_CAST(
+                uint8_t,
+                (src_ptr[i] * first_weight + second_row[i] * second_weight + 128) >> 8);
+    }
+}
+
+// Work around GCC 7 punning warning -Wstrict-aliasing
+#if defined(__GNUC__)
+typedef uint32_t __attribute__((__may_alias__)) uint32_alias_t;
+#else
+typedef uint32_t uint32_alias_t;
+#endif
+
+#undef STATIC_CAST
diff --git a/pkg/encoder/yuv/libyuv/row_gcc.c b/pkg/encoder/yuv/libyuv/row_gcc.c
new file mode 100644
index 000000000..07e795e60
--- /dev/null
+++ b/pkg/encoder/yuv/libyuv/row_gcc.c
@@ -0,0 +1,1090 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "row.h"
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+
+// Coefficient tables for the RGB -> Y/U/V row functions in this file. Every
+// table repeats one 4-byte group four times, one byte per channel in the
+// pixel's in-memory byte order; the 0 entry leaves that byte out of the sum
+// (presumably the alpha channel - confirm against the format definitions).
+// The Y tables are unsigned because RGBTOY multiplies them as the unsigned
+// operand of pmaddubsw; the U/V tables are signed.
+// NOTE(review): BGRAToYRow_SSSE3 and the BGRA/ABGR/RGBA *ToUVRow_SSSE3
+// functions further down are not wrapped in HAS_* guards, while the tables
+// they read are only defined when the guards below hold - confirm row.h
+// always defines these macros on x86 builds.
+#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
+
+// Constants for ARGB
+static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u,
+                               25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u};
+
+
+#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
+
+#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
+
+// U and V weights for ARGB; same channel order as kARGBToY.
+static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
+                              112, -74, -38, 0, 112, -74, -38, 0};
+
+static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0,
+                              -18, -94, 112, 0, -18, -94, 112, 0};
+
+// Constants for BGRA
+// Same weights as the ARGB tables with the four bytes in reverse order.
+static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u,
+                               0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u};
+
+static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
+                              0, -38, -74, 112, 0, -38, -74, 112};
+
+static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
+                              0, 112, -94, -18, 0, 112, -94, -18};
+
+// Constants for ABGR
+// Same weights as the ARGB tables with the first and third bytes swapped.
+static const uvec8 kABGRToY = {66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u,
+                               66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u};
+
+static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
+                              -38, -74, 112, 0, -38, -74, 112, 0};
+
+static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
+                              112, -94, -18, 0, 112, -94, -18, 0};
+
+// Constants for RGBA.
+// kRGBAToY is left commented out; no RGBA -> Y row function appears in this
+// part of the file.
+//static const uvec8 kRGBAToY = {0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u,
+//                               0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u};
+
+static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
+                              0, 112, -74, -38, 0, 112, -74, -38};
+
+static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
+                              0, -18, -94, 112, 0, -18, -94, 112};
+
+// Added to every 16-bit Y sum before the >> 8 in RGBTOY / RGBTOY_AVX2.
+// 0x7e80 = 128 * (25 + 129 + 66) + (16 << 8) + 128: it cancels the -128 bias
+// that kSub128 puts on each source byte, adds 16 to the final Y value and
+// adds 128 so that the shift rounds to nearest.
+static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u,
+                               0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u};
+
+// Added (paddb) to the packed signed U/V bytes in the *ToUVRow functions.
+static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+                                128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
+
+// Every byte is 0x80; subtracted (psubb) from the source bytes so they can be
+// used as the signed operand of pmaddubsw.
+static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
+                               0x8080u, 0x8080u, 0x8080u, 0x8080u};
+
+#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
+
+// clang-format off
+
+// Loop body shared by the SSSE3 *ToYRow functions. It expects
+//   %0 = source pointer, %1 = destination pointer, %2 = pixels remaining,
+//   xmm4 = per-channel coefficients, xmm5 = byte bias to subtract.
+// Each iteration loads 64 source bytes, stores 16 result bytes and subtracts
+// 16 from %2, repeating while %2 > 0, so work is always done in whole groups
+// of 16 pixels. xmm0-xmm3 and xmm6 are used as scratch.
+// TODO(mraptis): Consider passing R, G, B multipliers as parameter.
+// round parameter is register containing value to add before shift.
+#define RGBTOY(round)                                                        \
+    "1: \n"                                                                  \
+    "movdqu (%0),%%xmm0 \n"                                                  \
+    "movdqu 0x10(%0),%%xmm1 \n"                                              \
+    "movdqu 0x20(%0),%%xmm2 \n"                                              \
+    "movdqu 0x30(%0),%%xmm3 \n"                                              \
+    "psubb %%xmm5,%%xmm0 \n"                                                 \
+    "psubb %%xmm5,%%xmm1 \n"                                                 \
+    "psubb %%xmm5,%%xmm2 \n"                                                 \
+    "psubb %%xmm5,%%xmm3 \n"                                                 \
+    "movdqu %%xmm4,%%xmm6 \n"                                                \
+    "pmaddubsw %%xmm0,%%xmm6 \n"                                             \
+    "movdqu %%xmm4,%%xmm0 \n"                                                \
+    "pmaddubsw %%xmm1,%%xmm0 \n"                                             \
+    "movdqu %%xmm4,%%xmm1 \n"                                                \
+    "pmaddubsw %%xmm2,%%xmm1 \n"                                             \
+    "movdqu %%xmm4,%%xmm2 \n"                                                \
+    "pmaddubsw %%xmm3,%%xmm2 \n"                                             \
+    "lea 0x40(%0),%0 \n"                                                     \
+    "phaddw %%xmm0,%%xmm6 \n"                                                \
+    "phaddw %%xmm2,%%xmm1 \n"                                                \
+    "prefetcht0 1280(%0) \n"                                                 \
+    "paddw %%" #round ",%%xmm6 \n"                                           \
+    "paddw %%" #round ",%%xmm1 \n"                                           \
+    "psrlw $0x8,%%xmm6 \n"                                                   \
+    "psrlw $0x8,%%xmm1 \n"                                                   \
+    "packuswb %%xmm1,%%xmm6 \n"                                              \
+    "movdqu %%xmm6,(%1) \n"                                                  \
+    "lea 0x10(%1),%1 \n"                                                     \
+    "sub $0x10,%2 \n"                                                        \
+    "jg 1b \n"
+
+// AVX2 form of RGBTOY: 128 source bytes in and 32 result bytes out per
+// iteration, with %2 reduced by 32 each time. In addition to the RGBTOY
+// register contract it expects ymm6 to hold the dword permutation applied by
+// vpermd after the pack, and it finishes with vzeroupper.
+#define RGBTOY_AVX2(round)                                                   \
+    "1: \n"                                                                  \
+    "vmovdqu (%0),%%ymm0 \n"                                                 \
+    "vmovdqu 0x20(%0),%%ymm1 \n"                                             \
+    "vmovdqu 0x40(%0),%%ymm2 \n"                                             \
+    "vmovdqu 0x60(%0),%%ymm3 \n"                                             \
+    "vpsubb %%ymm5, %%ymm0, %%ymm0 \n"                                       \
+    "vpsubb %%ymm5, %%ymm1, %%ymm1 \n"                                       \
+    "vpsubb %%ymm5, %%ymm2, %%ymm2 \n"                                       \
+    "vpsubb %%ymm5, %%ymm3, %%ymm3 \n"                                       \
+    "vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n"                                     \
+    "vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n"                                     \
+    "vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n"                                     \
+    "vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n"                                     \
+    "lea 0x80(%0),%0 \n"                                                     \
+    "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */                         \
+    "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"                                        \
+    "prefetcht0 1280(%0) \n"                                                 \
+    "vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */        \
+    "vpaddw %%" #round ",%%ymm2,%%ymm2 \n"                                   \
+    "vpsrlw $0x8,%%ymm0,%%ymm0 \n"                                           \
+    "vpsrlw $0x8,%%ymm2,%%ymm2 \n"                                           \
+    "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */                       \
+    "vpermd %%ymm0,%%ymm6,%%ymm0 \n" /* unmutate. */                         \
+    "vmovdqu %%ymm0,(%1) \n"                                                 \
+    "lea 0x20(%1),%1 \n"                                                     \
+    "sub $0x20,%2 \n"                                                        \
+    "jg 1b \n"                                                               \
+    "vzeroupper \n"
+
+// clang-format on
+
+#ifdef HAS_ARGBTOYROW_SSSE3
+
+// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
+// Loads kARGBToY, kSub128 and kAddY16 into xmm4, xmm5 and xmm7 and then runs
+// the RGBTOY loop; width is consumed 16 pixels at a time.
+void ARGBToYRow_SSSE3(const uint8_t *src_argb, uint8_t *dst_y, int width) {
+    asm volatile(
+        "movdqa %3,%%xmm4 \n"
+        "movdqa %4,%%xmm5 \n"
+        "movdqa %5,%%xmm7 \n"
+
+        LABELALIGN RGBTOY(xmm7)
+        : "+r"(src_argb),  // %0
+          "+r"(dst_y),     // %1
+          "+r"(width)      // %2
+        : "m"(kARGBToY),   // %3
+          "m"(kSub128),    // %4
+          "m"(kAddY16)     // %5
+        : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+          "xmm7");
+}
+
+#endif  // HAS_ARGBTOYROW_SSSE3
+
+#if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ABGRTOYROW_AVX2) || \
+    defined(HAS_ARGBEXTRACTALPHAROW_AVX2)
+// vpermd for vphaddw + vpackuswb vpermd.
+// Loaded into ymm6 by the AVX2 *ToYRow functions for the "unmutate" vpermd in
+// RGBTOY_AVX2.
+static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
+#endif
+
+#ifdef HAS_ARGBTOYROW_AVX2
+
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
+// Broadcasts kARGBToY, kSub128 and kAddY16 into ymm4, ymm5 and ymm7, loads
+// the kPermdARGBToY_AVX permutation into ymm6 and runs the RGBTOY_AVX2 loop;
+// width is consumed 32 pixels at a time.
+void ARGBToYRow_AVX2(const uint8_t *src_argb, uint8_t *dst_y, int width) {
+    asm volatile(
+        "vbroadcastf128 %3,%%ymm4 \n"
+        "vbroadcastf128 %4,%%ymm5 \n"
+        "vbroadcastf128 %5,%%ymm7 \n"
+        "vmovdqu %6,%%ymm6 \n"
+
+        // RGBTOY_AVX2 already ends with vzeroupper, so none is added here.
+        LABELALIGN RGBTOY_AVX2(ymm7)
+        : "+r"(src_argb),         // %0
+          "+r"(dst_y),            // %1
+          "+r"(width)             // %2
+        : "m"(kARGBToY),          // %3
+          "m"(kSub128),           // %4
+          "m"(kAddY16),           // %5
+          "m"(kPermdARGBToY_AVX)  // %6
+        : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+          "xmm7");
+}
+
+#endif  // HAS_ARGBTOYROW_AVX2
+
+#ifdef HAS_ABGRTOYROW_AVX2
+
+// Convert 32 ABGR pixels (128 bytes) to 32 Y values.
+// Identical to ARGBToYRow_AVX2 except that kABGRToY supplies the coefficients.
+void ABGRToYRow_AVX2(const uint8_t *src_abgr, uint8_t *dst_y, int width) {
+    asm volatile(
+        "vbroadcastf128 %3,%%ymm4 \n"
+        "vbroadcastf128 %4,%%ymm5 \n"
+        "vbroadcastf128 %5,%%ymm7 \n"
+        "vmovdqu %6,%%ymm6 \n"
+
+        // RGBTOY_AVX2 already ends with vzeroupper, so none is added here.
+        LABELALIGN RGBTOY_AVX2(ymm7)
+        : "+r"(src_abgr),         // %0
+          "+r"(dst_y),            // %1
+          "+r"(width)             // %2
+        : "m"(kABGRToY),          // %3
+          "m"(kSub128),           // %4
+          "m"(kAddY16),           // %5
+          "m"(kPermdARGBToY_AVX)  // %6
+        : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+          "xmm7");
+}
+
+#endif  // HAS_ABGRTOYROW_AVX2
+
+#ifdef HAS_ARGBTOUVROW_SSSE3
+
+// Converts 16 ARGB pixels per iteration, taken from two rows (src_argb and
+// src_argb + src_stride_argb), into 8 U bytes and 8 V bytes. The two rows are
+// averaged, horizontally adjacent pixels are averaged, and the result is
+// multiplied by kARGBToU / kARGBToV, shifted right by 8 and biased with
+// kAddUV128. dst_v is turned into an offset from dst_u ("sub %1,%2") so that
+// a single advancing pointer addresses both outputs. width is consumed 16
+// pixels at a time.
+void ARGBToUVRow_SSSE3(const uint8_t *src_argb,
+                       int src_stride_argb,
+                       uint8_t *dst_u,
+                       uint8_t *dst_v,
+                       int width) {
+    asm volatile(
+        "movdqa %5,%%xmm3 \n"
+        "movdqa %6,%%xmm4 \n"
+        "movdqa %7,%%xmm5 \n"
+        "sub %1,%2 \n"
+
+        LABELALIGN
+        "1: \n"
+        "movdqu (%0),%%xmm0 \n"
+        "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+        "pavgb %%xmm7,%%xmm0 \n"
+        "movdqu 0x10(%0),%%xmm1 \n"
+        "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+        "pavgb %%xmm7,%%xmm1 \n"
+        "movdqu 0x20(%0),%%xmm2 \n"
+        "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+        "pavgb %%xmm7,%%xmm2 \n"
+        "movdqu 0x30(%0),%%xmm6 \n"
+        "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+        "pavgb %%xmm7,%%xmm6 \n"
+
+        "lea 0x40(%0),%0 \n"
+        "movdqa %%xmm0,%%xmm7 \n"
+        "shufps $0x88,%%xmm1,%%xmm0 \n"
+        "shufps $0xdd,%%xmm1,%%xmm7 \n"
+        "pavgb %%xmm7,%%xmm0 \n"
+        "movdqa %%xmm2,%%xmm7 \n"
+        "shufps $0x88,%%xmm6,%%xmm2 \n"
+        "shufps $0xdd,%%xmm6,%%xmm7 \n"
+        "pavgb %%xmm7,%%xmm2 \n"
+        "movdqa %%xmm0,%%xmm1 \n"
+        "movdqa %%xmm2,%%xmm6 \n"
+        "pmaddubsw %%xmm4,%%xmm0 \n"
+        "pmaddubsw %%xmm4,%%xmm2 \n"
+        "pmaddubsw %%xmm3,%%xmm1 \n"
+        "pmaddubsw %%xmm3,%%xmm6 \n"
+        "phaddw %%xmm2,%%xmm0 \n"
+        "phaddw %%xmm6,%%xmm1 \n"
+        "psraw $0x8,%%xmm0 \n"
+        "psraw $0x8,%%xmm1 \n"
+        "packsswb %%xmm1,%%xmm0 \n"
+        "paddb %%xmm5,%%xmm0 \n"
+        "movlps %%xmm0,(%1) \n"
+        "movhps %%xmm0,0x00(%1,%2,1) \n"
+        "lea 0x8(%1),%1 \n"
+        "sub $0x10,%3 \n"
+        "jg 1b \n"
+        : "+r"(src_argb),                     // %0
+          "+r"(dst_u),                        // %1
+          "+r"(dst_v),                        // %2
+          "+rm"(width)                        // %3
+        : "r"((intptr_t) (src_stride_argb)),  // %4
+          "m"(kARGBToV),                      // %5
+          "m"(kARGBToU),                      // %6
+          "m"(kAddUV128)                      // %7
+        // xmm3-xmm5 are overwritten with the coefficient and bias vectors at
+        // the top of the block, so they are listed as clobbered as well.
+        : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+          "xmm7");
+}
+
+#endif  // HAS_ARGBTOUVROW_SSSE3
+
+#if defined(HAS_ARGBTOUVROW_AVX2) || defined(HAS_ABGRTOUVROW_AVX2) || \
+    defined(HAS_ARGBTOUVJROW_AVX2) || defined(HAS_ABGRTOUVJROW_AVX2)
+// vpshufb for vphaddw + vpackuswb packed to shorts.
+static const lvec8 kShufARGBToUV_AVX = {
+        0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+        0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
+#endif
+
+#if defined(HAS_ARGBTOUVROW_AVX2)
+
+// Converts 32 ARGB pixels per iteration, taken from two rows (src_argb and
+// src_argb + src_stride_argb), into 16 U bytes and 16 V bytes: rows are
+// averaged, horizontally adjacent pixels are averaged, and the result is
+// multiplied by kARGBToU / kARGBToV, shifted right by 8, reordered with
+// kShufARGBToUV_AVX and biased with kAddUV128. dst_v is turned into an offset
+// from dst_u so one advancing pointer addresses both outputs.
+void ARGBToUVRow_AVX2(const uint8_t *src_argb,
+                      int src_stride_argb,
+                      uint8_t *dst_u,
+                      uint8_t *dst_v,
+                      int width) {
+    asm volatile(
+        "vbroadcastf128 %5,%%ymm5 \n"
+        "vbroadcastf128 %6,%%ymm6 \n"
+        "vbroadcastf128 %7,%%ymm7 \n"
+        "sub %1,%2 \n"
+
+        LABELALIGN
+        "1: \n"
+        "vmovdqu (%0),%%ymm0 \n"
+        "vmovdqu 0x20(%0),%%ymm1 \n"
+        "vmovdqu 0x40(%0),%%ymm2 \n"
+        "vmovdqu 0x60(%0),%%ymm3 \n"
+        "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+        "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+        "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
+        "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
+        "lea 0x80(%0),%0 \n"
+        "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+        "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+        "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+        "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+        "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+        "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+
+        "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+        "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+        "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+        "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+        "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+        "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+        "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+        "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+        "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+        "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+        "vpshufb %8,%%ymm0,%%ymm0 \n"
+        "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
+
+        "vextractf128 $0x0,%%ymm0,(%1) \n"
+        "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
+        "lea 0x10(%1),%1 \n"
+        "sub $0x20,%3 \n"
+        "jg 1b \n"
+        "vzeroupper \n"
+        : "+r"(src_argb),                     // %0
+          "+r"(dst_u),                        // %1
+          "+r"(dst_v),                        // %2
+          "+rm"(width)                        // %3
+        : "r"((intptr_t) (src_stride_argb)),  // %4
+          "m"(kAddUV128),                     // %5
+          "m"(kARGBToV),                      // %6
+          "m"(kARGBToU),                      // %7
+          "m"(kShufARGBToUV_AVX)              // %8
+        : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+          "xmm7");
+}
+
+#endif  // HAS_ARGBTOUVROW_AVX2
+
+#ifdef HAS_ABGRTOUVROW_AVX2
+
+// Identical to ARGBToUVRow_AVX2 except that kABGRToU / kABGRToV supply the
+// coefficients.
+void ABGRToUVRow_AVX2(const uint8_t *src_abgr,
+                      int src_stride_abgr,
+                      uint8_t *dst_u,
+                      uint8_t *dst_v,
+                      int width) {
+    asm volatile(
+        "vbroadcastf128 %5,%%ymm5 \n"
+        "vbroadcastf128 %6,%%ymm6 \n"
+        "vbroadcastf128 %7,%%ymm7 \n"
+        "sub %1,%2 \n"
+
+        LABELALIGN
+        "1: \n"
+        "vmovdqu (%0),%%ymm0 \n"
+        "vmovdqu 0x20(%0),%%ymm1 \n"
+        "vmovdqu 0x40(%0),%%ymm2 \n"
+        "vmovdqu 0x60(%0),%%ymm3 \n"
+        "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+        "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+        "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
+        "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
+        "lea 0x80(%0),%0 \n"
+        "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+        "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+        "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+        "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+        "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+        "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+
+        "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+        "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+        "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+        "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+        "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+        "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+        "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+        "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+        "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+        "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+        "vpshufb %8,%%ymm0,%%ymm0 \n"
+        "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
+
+        "vextractf128 $0x0,%%ymm0,(%1) \n"
+        "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
+        "lea 0x10(%1),%1 \n"
+        "sub $0x20,%3 \n"
+        "jg 1b \n"
+        "vzeroupper \n"
+        : "+r"(src_abgr),                     // %0
+          "+r"(dst_u),                        // %1
+          "+r"(dst_v),                        // %2
+          "+rm"(width)                        // %3
+        : "r"((intptr_t) (src_stride_abgr)),  // %4
+          "m"(kAddUV128),                     // %5
+          "m"(kABGRToV),                      // %6
+          "m"(kABGRToU),                      // %7
+          "m"(kShufARGBToUV_AVX)              // %8
+        : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+          "xmm7");
+}
+
+#endif  // HAS_ABGRTOUVROW_AVX2
+
+// Converts BGRA pixels to Y, 16 pixels (64 bytes) per iteration, by running
+// the RGBTOY loop with kBGRAToY, kSub128 and kAddY16.
+void BGRAToYRow_SSSE3(const uint8_t *src_bgra, uint8_t *dst_y, int width) {
+    asm volatile(
+        "movdqa %3,%%xmm4 \n"
+        "movdqa %4,%%xmm5 \n"
+        "movdqa %5,%%xmm7 \n"
+
+        LABELALIGN RGBTOY(xmm7)
+        : "+r"(src_bgra),  // %0
+          "+r"(dst_y),     // %1
+          "+r"(width)      // %2
+        : "m"(kBGRAToY),   // %3
+          "m"(kSub128),    // %4
+          "m"(kAddY16)     // %5
+        : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+          "xmm7");
+}
+
+// Converts 16 BGRA pixels per iteration, taken from two rows (src_bgra and
+// src_bgra + src_stride_bgra), into 8 U bytes and 8 V bytes: rows are
+// averaged, horizontally adjacent pixels are averaged, and the result is
+// multiplied by kBGRAToU / kBGRAToV, shifted right by 8 and biased with
+// kAddUV128. dst_v is turned into an offset from dst_u so one advancing
+// pointer addresses both outputs.
+void BGRAToUVRow_SSSE3(const uint8_t *src_bgra,
+                       int src_stride_bgra,
+                       uint8_t *dst_u,
+                       uint8_t *dst_v,
+                       int width) {
+    asm volatile(
+        "movdqa %5,%%xmm3 \n"
+        "movdqa %6,%%xmm4 \n"
+        "movdqa %7,%%xmm5 \n"
+        "sub %1,%2 \n"
+
+        LABELALIGN
+        "1: \n"
+        "movdqu (%0),%%xmm0 \n"
+        "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+        "pavgb %%xmm7,%%xmm0 \n"
+        "movdqu 0x10(%0),%%xmm1 \n"
+        "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+        "pavgb %%xmm7,%%xmm1 \n"
+        "movdqu 0x20(%0),%%xmm2 \n"
+        "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+        "pavgb %%xmm7,%%xmm2 \n"
+        "movdqu 0x30(%0),%%xmm6 \n"
+        "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+        "pavgb %%xmm7,%%xmm6 \n"
+
+        "lea 0x40(%0),%0 \n"
+        "movdqa %%xmm0,%%xmm7 \n"
+        "shufps $0x88,%%xmm1,%%xmm0 \n"
+        "shufps $0xdd,%%xmm1,%%xmm7 \n"
+        "pavgb %%xmm7,%%xmm0 \n"
+        "movdqa %%xmm2,%%xmm7 \n"
+        "shufps $0x88,%%xmm6,%%xmm2 \n"
+        "shufps $0xdd,%%xmm6,%%xmm7 \n"
+        "pavgb %%xmm7,%%xmm2 \n"
+        "movdqa %%xmm0,%%xmm1 \n"
+        "movdqa %%xmm2,%%xmm6 \n"
+        "pmaddubsw %%xmm4,%%xmm0 \n"
+        "pmaddubsw %%xmm4,%%xmm2 \n"
+        "pmaddubsw %%xmm3,%%xmm1 \n"
+        "pmaddubsw %%xmm3,%%xmm6 \n"
+        "phaddw %%xmm2,%%xmm0 \n"
+        "phaddw %%xmm6,%%xmm1 \n"
+        "psraw $0x8,%%xmm0 \n"
+        "psraw $0x8,%%xmm1 \n"
+        "packsswb %%xmm1,%%xmm0 \n"
+        "paddb %%xmm5,%%xmm0 \n"
+        "movlps %%xmm0,(%1) \n"
+        "movhps %%xmm0,0x00(%1,%2,1) \n"
+        "lea 0x8(%1),%1 \n"
+        "sub $0x10,%3 \n"
+        "jg 1b \n"
+        : "+r"(src_bgra),                     // %0
+          "+r"(dst_u),                        // %1
+          "+r"(dst_v),                        // %2
+          "+rm"(width)                        // %3
+        : "r"((intptr_t) (src_stride_bgra)),  // %4
+          "m"(kBGRAToV),                      // %5
+          "m"(kBGRAToU),                      // %6
+          "m"(kAddUV128)                      // %7
+        // xmm3-xmm5 are overwritten with the coefficient and bias vectors at
+        // the top of the block, so they are listed as clobbered as well.
+        : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+          "xmm7");
+}
+
+// Converts ABGR pixels to Y, 16 pixels (64 bytes) per iteration, by running
+// the RGBTOY loop with kABGRToY, kSub128 and kAddY16.
+void ABGRToYRow_SSSE3(const uint8_t *src_abgr, uint8_t *dst_y, int width) {
+    asm volatile(
+        "movdqa %3,%%xmm4 \n"
+        "movdqa %4,%%xmm5 \n"
+        "movdqa %5,%%xmm7 \n"
+
+        LABELALIGN RGBTOY(xmm7)
+        : "+r"(src_abgr),  // %0
+          "+r"(dst_y),     // %1
+          "+r"(width)      // %2
+        : "m"(kABGRToY),   // %3
+          "m"(kSub128),    // %4
+          "m"(kAddY16)     // %5
+        : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+          "xmm7");
+}
+
+// Same algorithm as BGRAToUVRow_SSSE3 with kABGRToU / kABGRToV as the
+// coefficients.
+void ABGRToUVRow_SSSE3(const uint8_t *src_abgr,
+                       int src_stride_abgr,
+                       uint8_t *dst_u,
+                       uint8_t *dst_v,
+                       int width) {
+    asm volatile(
+        "movdqa %5,%%xmm3 \n"
+        "movdqa %6,%%xmm4 \n"
+        "movdqa %7,%%xmm5 \n"
+        "sub %1,%2 \n"
+
+        LABELALIGN
+        "1: \n"
+        "movdqu (%0),%%xmm0 \n"
+        "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+        "pavgb %%xmm7,%%xmm0 \n"
+        "movdqu 0x10(%0),%%xmm1 \n"
+        "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+        "pavgb %%xmm7,%%xmm1 \n"
+        "movdqu 0x20(%0),%%xmm2 \n"
+        "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+        "pavgb %%xmm7,%%xmm2 \n"
+        "movdqu 0x30(%0),%%xmm6 \n"
+        "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+        "pavgb %%xmm7,%%xmm6 \n"
+
+        "lea 0x40(%0),%0 \n"
+        "movdqa %%xmm0,%%xmm7 \n"
+        "shufps $0x88,%%xmm1,%%xmm0 \n"
+        "shufps $0xdd,%%xmm1,%%xmm7 \n"
+        "pavgb %%xmm7,%%xmm0 \n"
+        "movdqa %%xmm2,%%xmm7 \n"
+        "shufps $0x88,%%xmm6,%%xmm2 \n"
+        "shufps $0xdd,%%xmm6,%%xmm7 \n"
+        "pavgb %%xmm7,%%xmm2 \n"
+        "movdqa %%xmm0,%%xmm1 \n"
+        "movdqa %%xmm2,%%xmm6 \n"
+        "pmaddubsw %%xmm4,%%xmm0 \n"
+        "pmaddubsw %%xmm4,%%xmm2 \n"
+        "pmaddubsw %%xmm3,%%xmm1 \n"
+        "pmaddubsw %%xmm3,%%xmm6 \n"
+        "phaddw %%xmm2,%%xmm0 \n"
+        "phaddw %%xmm6,%%xmm1 \n"
+        "psraw $0x8,%%xmm0 \n"
+        "psraw $0x8,%%xmm1 \n"
+        "packsswb %%xmm1,%%xmm0 \n"
+        "paddb %%xmm5,%%xmm0 \n"
+        "movlps %%xmm0,(%1) \n"
+        "movhps %%xmm0,0x00(%1,%2,1) \n"
+        "lea 0x8(%1),%1 \n"
+        "sub $0x10,%3 \n"
+        "jg 1b \n"
+        : "+r"(src_abgr),                     // %0
+          "+r"(dst_u),                        // %1
+          "+r"(dst_v),                        // %2
+          "+rm"(width)                        // %3
+        : "r"((intptr_t) (src_stride_abgr)),  // %4
+          "m"(kABGRToV),                      // %5
+          "m"(kABGRToU),                      // %6
+          "m"(kAddUV128)                      // %7
+        // xmm3-xmm5 are overwritten with the coefficient and bias vectors at
+        // the top of the block, so they are listed as clobbered as well.
+        : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+          "xmm7");
+}
+
+// Same algorithm as BGRAToUVRow_SSSE3 with kRGBAToU / kRGBAToV as the
+// coefficients.
+void RGBAToUVRow_SSSE3(const uint8_t *src_rgba,
+                       int src_stride_rgba,
+                       uint8_t *dst_u,
+                       uint8_t *dst_v,
+                       int width) {
+    asm volatile(
+        "movdqa %5,%%xmm3 \n"
+        "movdqa %6,%%xmm4 \n"
+        "movdqa %7,%%xmm5 \n"
+        "sub %1,%2 \n"
+
+        LABELALIGN
+        "1: \n"
+        "movdqu (%0),%%xmm0 \n"
+        "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+        "pavgb %%xmm7,%%xmm0 \n"
+        "movdqu 0x10(%0),%%xmm1 \n"
+        "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+        "pavgb %%xmm7,%%xmm1 \n"
+        "movdqu 0x20(%0),%%xmm2 \n"
+        "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+        "pavgb %%xmm7,%%xmm2 \n"
+        "movdqu 0x30(%0),%%xmm6 \n"
+        "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+        "pavgb %%xmm7,%%xmm6 \n"
+
+        "lea 0x40(%0),%0 \n"
+        "movdqa %%xmm0,%%xmm7 \n"
+        "shufps $0x88,%%xmm1,%%xmm0 \n"
+        "shufps $0xdd,%%xmm1,%%xmm7 \n"
+        "pavgb %%xmm7,%%xmm0 \n"
+        "movdqa %%xmm2,%%xmm7 \n"
+        "shufps $0x88,%%xmm6,%%xmm2 \n"
+        "shufps $0xdd,%%xmm6,%%xmm7 \n"
+        "pavgb %%xmm7,%%xmm2 \n"
+        "movdqa %%xmm0,%%xmm1 \n"
+        "movdqa %%xmm2,%%xmm6 \n"
+        "pmaddubsw %%xmm4,%%xmm0 \n"
+        "pmaddubsw %%xmm4,%%xmm2 \n"
+        "pmaddubsw %%xmm3,%%xmm1 \n"
+        "pmaddubsw %%xmm3,%%xmm6 \n"
+        "phaddw %%xmm2,%%xmm0 \n"
+        "phaddw %%xmm6,%%xmm1 \n"
+        "psraw $0x8,%%xmm0 \n"
+        "psraw $0x8,%%xmm1 \n"
+        "packsswb %%xmm1,%%xmm0 \n"
+        "paddb %%xmm5,%%xmm0 \n"
+        "movlps %%xmm0,(%1) \n"
+        "movhps %%xmm0,0x00(%1,%2,1) \n"
+        "lea 0x8(%1),%1 \n"
+        "sub $0x10,%3 \n"
+        "jg 1b \n"
+        : "+r"(src_rgba),                     // %0
+          "+r"(dst_u),                        // %1
+          "+r"(dst_v),                        // %2
+          "+rm"(width)                        // %3
+        : "r"((intptr_t) (src_stride_rgba)),  // %4
+          "m"(kRGBAToV),                      // %5
+          "m"(kRGBAToU),                      // %6
+          "m"(kAddUV128)                      // %7
+        // xmm3-xmm5 are overwritten with the coefficient and bias vectors at
+        // the top of the block, so they are listed as clobbered as well.
+        : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+          "xmm7");
+}
+
+#ifdef HAS_MIRRORROW_SSSE3
+// Shuffle table for reversing the bytes.
+// NOTE(review): MirrorRow_AVX2 below also reads this table although it is
+// only defined under HAS_MIRRORROW_SSSE3 - confirm both macros are always
+// defined together.
+static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
+                                     7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
+
+// Writes the bytes of src to dst in reverse order, 16 bytes per iteration.
+// Each pass loads the 16 bytes that end at src + (bytes remaining), reverses
+// them with pshufb and stores them at the advancing dst pointer; width is
+// reduced by 16 until it is no longer positive.
+void MirrorRow_SSSE3(const uint8_t *src, uint8_t *dst, int width) {
+    // Widened to pointer size because it is used as an index register below.
+    intptr_t temp_width = (intptr_t) (width);
+    asm volatile(
+
+        "movdqa %3,%%xmm5 \n"
+
+        LABELALIGN
+        "1: \n"
+        "movdqu -0x10(%0,%2,1),%%xmm0 \n"
+        "pshufb %%xmm5,%%xmm0 \n"
+        "movdqu %%xmm0,(%1) \n"
+        "lea 0x10(%1),%1 \n"
+        "sub $0x10,%2 \n"
+        "jg 1b \n"
+        : "+r"(src),           // %0
+          "+r"(dst),           // %1
+          "+r"(temp_width)     // %2
+        : "m"(kShuffleMirror)  // %3
+        : "memory", "cc", "xmm0", "xmm5");
+}
+
+#endif  // HAS_MIRRORROW_SSSE3
+
+#ifdef HAS_MIRRORROW_AVX2
+
+// AVX2 version of MirrorRow_SSSE3 handling 32 bytes per iteration. vpshufb
+// reverses the bytes inside each 128-bit half and vpermq $0x4e then swaps the
+// two halves.
+void MirrorRow_AVX2(const uint8_t *src, uint8_t *dst, int width) {
+    // Widened to pointer size because it is used as an index register below.
+    intptr_t temp_width = (intptr_t) (width);
+    asm volatile(
+
+        "vbroadcastf128 %3,%%ymm5 \n"
+
+        LABELALIGN
+        "1: \n"
+        "vmovdqu -0x20(%0,%2,1),%%ymm0 \n"
+        "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+        "vpermq $0x4e,%%ymm0,%%ymm0 \n"
+        "vmovdqu %%ymm0,(%1) \n"
+        "lea 0x20(%1),%1 \n"
+        "sub $0x20,%2 \n"
+        "jg 1b \n"
+        "vzeroupper \n"
+        : "+r"(src),           // %0
+          "+r"(dst),           // %1
+          "+r"(temp_width)     // %2
+        : "m"(kShuffleMirror)  // %3
+        : "memory", "cc", "xmm0", "xmm5");
+}
+
+#endif  // HAS_MIRRORROW_AVX2
+
+#ifdef HAS_MIRRORUVROW_SSSE3
+// Shuffle table for reversing the UV.
+// Reverses the order of the eight 2-byte pairs in a 16-byte vector while
+// keeping the two bytes of each pair in their original order.
+// NOTE(review): MirrorUVRow_AVX2 below also reads this table although it is
+// only defined under HAS_MIRRORUVROW_SSSE3 - confirm both macros are always
+// defined together.
+static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
+                                       6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
+
+// Writes the 2-byte pairs of src_uv to dst_uv in reverse order, 8 pairs
+// (16 bytes) per iteration. width counts pairs: it scales the source index by
+// 2 and is reduced by 8 per pass until it is no longer positive.
+void MirrorUVRow_SSSE3(const uint8_t *src_uv, uint8_t *dst_uv, int width) {
+    // Widened to pointer size because it is used as an index register below.
+    intptr_t temp_width = (intptr_t) (width);
+    asm volatile(
+
+        "movdqa %3,%%xmm5 \n"
+
+        LABELALIGN
+        "1: \n"
+        "movdqu -0x10(%0,%2,2),%%xmm0 \n"
+        "pshufb %%xmm5,%%xmm0 \n"
+        "movdqu %%xmm0,(%1) \n"
+        "lea 0x10(%1),%1 \n"
+        "sub $0x8,%2 \n"
+        "jg 1b \n"
+        : "+r"(src_uv),          // %0
+          "+r"(dst_uv),          // %1
+          "+r"(temp_width)       // %2
+        : "m"(kShuffleMirrorUV)  // %3
+        : "memory", "cc", "xmm0", "xmm5");
+}
+
+#endif  // HAS_MIRRORUVROW_SSSE3
+
+#ifdef HAS_MIRRORUVROW_AVX2
+
+// AVX2 version of MirrorUVRow_SSSE3 handling 16 pairs (32 bytes) per
+// iteration. vpshufb reverses the pairs inside each 128-bit half and
+// vpermq $0x4e then swaps the two halves.
+void MirrorUVRow_AVX2(const uint8_t *src_uv, uint8_t *dst_uv, int width) {
+    // Widened to pointer size because it is used as an index register below.
+    intptr_t temp_width = (intptr_t) (width);
+    asm volatile(
+
+        "vbroadcastf128 %3,%%ymm5 \n"
+
+        LABELALIGN
+        "1: \n"
+        "vmovdqu -0x20(%0,%2,2),%%ymm0 \n"
+        "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+        "vpermq $0x4e,%%ymm0,%%ymm0 \n"
+        "vmovdqu %%ymm0,(%1) \n"
+        "lea 0x20(%1),%1 \n"
+        "sub $0x10,%2 \n"
+        "jg 1b \n"
+        "vzeroupper \n"
+        : "+r"(src_uv),          // %0
+          "+r"(dst_uv),          // %1
+          "+r"(temp_width)       // %2
+        : "m"(kShuffleMirrorUV)  // %3
+        : "memory", "cc", "xmm0", "xmm5");
+}
+
+#endif  // HAS_MIRRORUVROW_AVX2
+
+#ifdef HAS_MIRRORSPLITUVROW_SSSE3
+// Shuffle table for reversing the bytes of UV channels.
+// Gathers the even-indexed bytes in reverse order into the low 8 bytes and
+// the odd-indexed bytes in reverse order into the high 8 bytes.
+static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
+                                            15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
+
+// Mirrors a row of interleaved 2-byte pairs and splits it into two planes,
+// 8 pairs (16 source bytes) per iteration. src is first moved to the last 16
+// bytes of the row and then walks backwards; after the shuffle the low 8
+// bytes (even source bytes) go to dst_u and the high 8 bytes (odd source
+// bytes) go to dst_v. dst_v is turned into an offset from dst_u so one
+// advancing pointer addresses both outputs.
+void MirrorSplitUVRow_SSSE3(const uint8_t *src,
+                            uint8_t *dst_u,
+                            uint8_t *dst_v,
+                            int width) {
+    // Widened to pointer size because it is used as an index register below.
+    intptr_t temp_width = (intptr_t) (width);
+    asm volatile(
+        "movdqa %4,%%xmm1 \n"
+        "lea -0x10(%0,%3,2),%0 \n"
+        "sub %1,%2 \n"
+
+        LABELALIGN
+        "1: \n"
+        "movdqu (%0),%%xmm0 \n"
+        "lea -0x10(%0),%0 \n"
+        "pshufb %%xmm1,%%xmm0 \n"
+        "movlpd %%xmm0,(%1) \n"
+        "movhpd %%xmm0,0x00(%1,%2,1) \n"
+        "lea 0x8(%1),%1 \n"
+        "sub $8,%3 \n"
+        "jg 1b \n"
+        : "+r"(src),                  // %0
+          "+r"(dst_u),                // %1
+          "+r"(dst_v),                // %2
+          "+r"(temp_width)            // %3
+        : "m"(kShuffleMirrorSplitUV)  // %4
+        : "memory", "cc", "xmm0", "xmm1");
+}
+
+#endif  // HAS_MIRRORSPLITUVROW_SSSE3
+
+#ifdef HAS_ARGBMIRRORROW_SSE2
+
+// Writes the 4-byte pixels of src to dst in reverse order, 4 pixels (16
+// bytes) per iteration. src is first moved to the last 16 bytes of the row
+// and then walks backwards; pshufd $0x1b reverses the four dwords of each
+// load. width counts pixels and is reduced by 4 per pass.
+void ARGBMirrorRow_SSE2(const uint8_t *src, uint8_t *dst, int width) {
+    // Widened to pointer size because it is used as an index register below.
+    intptr_t temp_width = (intptr_t) (width);
+    asm volatile(
+
+        "lea -0x10(%0,%2,4),%0 \n"
+
+        LABELALIGN
+        "1: \n"
+        "movdqu (%0),%%xmm0 \n"
+        "pshufd $0x1b,%%xmm0,%%xmm0 \n"
+        "lea -0x10(%0),%0 \n"
+        "movdqu %%xmm0,(%1) \n"
+        "lea 0x10(%1),%1 \n"
+        "sub $0x4,%2 \n"
+        "jg 1b \n"
+        : "+r"(src),        // %0
+          "+r"(dst),        // %1
+          "+r"(temp_width)  // %2
+        :
+        : "memory", "cc", "xmm0");
+}
+
+#endif  // HAS_ARGBMIRRORROW_SSE2
+
+#ifdef HAS_ARGBMIRRORROW_AVX2
+// Permutation table (vpermd indices) for reversing eight 32-bit pixels.
+static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
+
+// Writes the 4-byte pixels of src to dst in reverse order, 8 pixels (32
+// bytes) per iteration. Each pass permutes the 32 bytes that end at
+// src + 4 * (pixels remaining) with vpermd and stores them at the advancing
+// dst pointer; width counts pixels and is reduced by 8 per pass.
+void ARGBMirrorRow_AVX2(const uint8_t *src, uint8_t *dst, int width) {
+    // Widened to pointer size because it is used as an index register below.
+    intptr_t temp_width = (intptr_t) (width);
+    asm volatile(
+
+        "vmovdqu %3,%%ymm5 \n"
+
+        LABELALIGN
+        "1: \n"
+        "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n"
+        "vmovdqu %%ymm0,(%1) \n"
+        "lea 0x20(%1),%1 \n"
+        "sub $0x8,%2 \n"
+        "jg 1b \n"
+        "vzeroupper \n"
+        : "+r"(src),                    // %0
+          "+r"(dst),                    // %1
+          "+r"(temp_width)              // %2
+        : "m"(kARGBShuffleMirror_AVX2)  // %3
+        : "memory", "cc", "xmm0", "xmm5");
+}
+
+#endif  // HAS_ARGBMIRRORROW_AVX2
+
+
+#ifdef HAS_COPYROW_SSE2
+
+// Copies width bytes from src to dst, 32 bytes per iteration. If the low four
+// bits of both pointers are zero the aligned movdqa loop (label 1) is used;
+// otherwise the movdqu loop (label 2) runs. Either loop repeats until width,
+// reduced by 32 per pass, is no longer positive.
+void CopyRow_SSE2(const uint8_t *src, uint8_t *dst, int width) {
+    asm volatile(
+        "test $0xf,%0 \n"
+        "jne 2f \n"
+        "test $0xf,%1 \n"
+        "jne 2f \n"
+
+        LABELALIGN
+        "1: \n"
+        "movdqa (%0),%%xmm0 \n"
+        "movdqa 0x10(%0),%%xmm1 \n"
+        "lea 0x20(%0),%0 \n"
+        "movdqa %%xmm0,(%1) \n"
+        "movdqa %%xmm1,0x10(%1) \n"
+        "lea 0x20(%1),%1 \n"
+        "sub $0x20,%2 \n"
+        "jg 1b \n"
+        "jmp 9f \n"
+
+        LABELALIGN
+        "2: \n"
+        "movdqu (%0),%%xmm0 \n"
+        "movdqu 0x10(%0),%%xmm1 \n"
+        "lea 0x20(%0),%0 \n"
+        "movdqu %%xmm0,(%1) \n"
+        "movdqu %%xmm1,0x10(%1) \n"
+        "lea 0x20(%1),%1 \n"
+        "sub $0x20,%2 \n"
+        "jg 2b \n"
+
+        LABELALIGN "9: \n"
+        : "+r"(src),   // %0
+          "+r"(dst),   // %1
+          "+r"(width)  // %2
+        :
+        : "memory", "cc", "xmm0", "xmm1");
+}
+
+#endif  // HAS_COPYROW_SSE2
+
+#ifdef HAS_COPYROW_AVX
+
+// Copies width bytes from src to dst with unaligned 256-bit moves, 64 bytes
+// per iteration, until width (reduced by 64 per pass) is no longer positive.
+void CopyRow_AVX(const uint8_t *src, uint8_t *dst, int width) {
+    asm volatile(
+
+        LABELALIGN
+        "1: \n"
+        "vmovdqu (%0),%%ymm0 \n"
+        "vmovdqu 0x20(%0),%%ymm1 \n"
+        "lea 0x40(%0),%0 \n"
+        "vmovdqu %%ymm0,(%1) \n"
+        "vmovdqu %%ymm1,0x20(%1) \n"
+        "lea 0x40(%1),%1 \n"
+        "sub $0x40,%2 \n"
+        "jg 1b \n"
+        "vzeroupper \n"
+        : "+r"(src),   // %0
+          "+r"(dst),   // %1
+          "+r"(width)  // %2
+        :
+        : "memory", "cc", "xmm0", "xmm1");
+}
+
+#endif  // HAS_COPYROW_AVX
+
+#ifdef HAS_COPYROW_ERMS
+
+// Multiple of 1.
+void CopyRow_ERMS(const uint8_t *src, uint8_t *dst, int width) { + size_t width_tmp = (size_t) (width); + asm volatile( + + "rep movsb \n" + : "+S"(src), // %0 + "+D"(dst), // %1 + "+c"(width_tmp) // %2 + : + : "memory", "cc"); +} + +#endif // HAS_COPYROW_ERMS + +#ifdef HAS_INTERPOLATEROW_SSSE3 + +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_SSSE3(uint8_t *dst_ptr, + const uint8_t *src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction) { + asm volatile( + "sub %1,%0 \n" + "cmp $0x0,%3 \n" + "je 100f \n" + "cmp $0x80,%3 \n" + "je 50f \n" + + "movd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x100,%3 \n" + "movd %3,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x80808080,%%eax \n" + "movd %%eax,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + + // General purpose row blend. + LABELALIGN + "1: \n" + "movdqu (%1),%%xmm0 \n" + "movdqu 0x00(%1,%4,1),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "psubb %%xmm4,%%xmm0 \n" + "psubb %%xmm4,%%xmm1 \n" + "movdqa %%xmm5,%%xmm2 \n" + "movdqa %%xmm5,%%xmm3 \n" + "pmaddubsw %%xmm0,%%xmm2 \n" + "pmaddubsw %%xmm1,%%xmm3 \n" + "paddw %%xmm4,%%xmm2 \n" + "paddw %%xmm4,%%xmm3 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqu %%xmm2,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "jmp 99f \n" + + // Blend 50 / 50. + LABELALIGN + "50: \n" + "movdqu (%1),%%xmm0 \n" + "movdqu 0x00(%1,%4,1),%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 50b \n" + "jmp 99f \n" + + // Blend 100 / 0 - Copy row unchanged. 
+ LABELALIGN + "100: \n" + "movdqu (%1),%%xmm0 \n" + "movdqu %%xmm0,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+rm"(width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t) (src_stride)) // %4 + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +#endif // HAS_INTERPOLATEROW_SSSE3 + +#ifdef HAS_INTERPOLATEROW_AVX2 + +// Bilinear filter 32x2 -> 32x1 +void InterpolateRow_AVX2(uint8_t *dst_ptr, + const uint8_t *src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction) { + asm volatile( + "sub %1,%0 \n" + "cmp $0x0,%3 \n" + "je 100f \n" + "cmp $0x80,%3 \n" + "je 50f \n" + + "vmovd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x100,%3 \n" + "vmovd %3,%%xmm5 \n" + "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n" + "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n" + "vbroadcastss %%xmm5,%%ymm5 \n" + "mov $0x80808080,%%eax \n" + "vmovd %%eax,%%xmm4 \n" + "vbroadcastss %%xmm4,%%ymm4 \n" + + // General purpose row blend. + LABELALIGN + "1: \n" + "vmovdqu (%1),%%ymm0 \n" + "vmovdqu 0x00(%1,%4,1),%%ymm2 \n" + "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" + "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsubb %%ymm4,%%ymm1,%%ymm1 \n" + "vpsubb %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n" + "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n" + "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" + "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%1,%0,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "jmp 99f \n" + + // Blend 50 / 50. + LABELALIGN + "50: \n" + "vmovdqu (%1),%%ymm0 \n" + "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%1,%0,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 50b \n" + "jmp 99f \n" + + // Blend 100 / 0 - Copy row unchanged. 
+ LABELALIGN + "100: \n" + "vmovdqu (%1),%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%1,%0,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 100b \n" + + "99: \n" + "vzeroupper \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t) (src_stride)) // %4 + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"); +} + +#endif // HAS_INTERPOLATEROW_AVX2 + +#endif // defined(__x86_64__) || defined(__i386__) diff --git a/pkg/encoder/yuv/libyuv/scale.c b/pkg/encoder/yuv/libyuv/scale.c new file mode 100644 index 000000000..c4bd5b0b4 --- /dev/null +++ b/pkg/encoder/yuv/libyuv/scale.c @@ -0,0 +1,946 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "scale.h" + +#include +#include + +#include "cpu_id.h" +#include "planar_functions.h" // For CopyPlane +#include "row.h" +#include "scale_row.h" + +static __inline int Abs(int v) { + return v >= 0 ? v : -v; +} + +#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) +#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s) + +// Scale plane, 1/2 +// This is an optimized version for scaling down a plane to 1/2 of +// its original size. + +static void ScalePlaneDown2(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t *src_ptr, + uint8_t *dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown2)(const uint8_t *src_ptr, ptrdiff_t src_stride, + uint8_t *dst_ptr, int dst_width) = + filtering == kFilterNone + ? ScaleRowDown2_C + : (filtering == kFilterLinear ? 
ScaleRowDown2Linear_C + : ScaleRowDown2Box_C); + int row_stride = src_stride * 2; + (void) src_width; + (void) src_height; + if (!filtering) { + src_ptr += src_stride; // Point to odd rows. + src_stride = 0; + } + + +#if defined(HAS_SCALEROWDOWN2_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_SSSE3 + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 + : ScaleRowDown2Box_Any_SSSE3); + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_SSSE3 + : (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 + : ScaleRowDown2Box_SSSE3); + } + } +#endif +#if defined(HAS_SCALEROWDOWN2_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_AVX2 + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 + : ScaleRowDown2Box_Any_AVX2); + if (IS_ALIGNED(dst_width, 32)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 + : (filtering == kFilterLinear + ? ScaleRowDown2Linear_AVX2 + : ScaleRowDown2Box_AVX2); + } + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + // TODO(fbarchard): Loop through source height to allow odd height. + for (y = 0; y < dst_height; ++y) { + ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += row_stride; + dst_ptr += dst_stride; + } +} + +// Scale plane, 1/4 +// This is an optimized version for scaling down a plane to 1/4 of +// its original size. + +static void ScalePlaneDown4(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t *src_ptr, + uint8_t *dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown4)(const uint8_t *src_ptr, ptrdiff_t src_stride, + uint8_t *dst_ptr, int dst_width) = + filtering ? 
ScaleRowDown4Box_C : ScaleRowDown4_C; + int row_stride = src_stride * 4; + (void) src_width; + (void) src_height; + if (!filtering) { + src_ptr += src_stride * 2; // Point to row 2. + src_stride = 0; + } + +#if defined(HAS_SCALEROWDOWN4_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3; + if (IS_ALIGNED(dst_width, 8)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSSE3 : ScaleRowDown4_SSSE3; + } + } +#endif +#if defined(HAS_SCALEROWDOWN4_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2; + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2; + } + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (y = 0; y < dst_height; ++y) { + ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += row_stride; + dst_ptr += dst_stride; + } +} + +// Scale plane down, 3/4 +static void ScalePlaneDown34(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t *src_ptr, + uint8_t *dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown34_0)(const uint8_t *src_ptr, ptrdiff_t src_stride, + uint8_t *dst_ptr, int dst_width); + void (*ScaleRowDown34_1)(const uint8_t *src_ptr, ptrdiff_t src_stride, + uint8_t *dst_ptr, int dst_width); + const int filter_stride = (filtering == kFilterLinear) ? 
0 : src_stride; + (void) src_width; + (void) src_height; + assert(dst_width % 3 == 0); + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_C; + ScaleRowDown34_1 = ScaleRowDown34_C; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_C; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_C; + } + +#if defined(HAS_SCALEROWDOWN34_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_Any_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_Any_SSSE3; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_SSSE3; + } + if (dst_width % 24 == 0) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_SSSE3; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3; + } + } + } +#endif + + for (y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; + } + + // Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width); + } +} + +// Scale plane, 3/8 +// This is an optimized version for scaling down a plane to 3/8 +// of its original size. 
+// +// Uses box filter arranges like this +// aaabbbcc -> abc +// aaabbbcc def +// aaabbbcc ghi +// dddeeeff +// dddeeeff +// dddeeeff +// ggghhhii +// ggghhhii +// Boxes are 3x3, 2x3, 3x2 and 2x2 + +static void ScalePlaneDown38(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t *src_ptr, + uint8_t *dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown38_3)(const uint8_t *src_ptr, ptrdiff_t src_stride, + uint8_t *dst_ptr, int dst_width); + void (*ScaleRowDown38_2)(const uint8_t *src_ptr, ptrdiff_t src_stride, + uint8_t *dst_ptr, int dst_width); + const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; + assert(dst_width % 3 == 0); + (void) src_width; + (void) src_height; + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_C; + ScaleRowDown38_2 = ScaleRowDown38_C; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_C; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_C; + } + +#if defined(HAS_SCALEROWDOWN38_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_Any_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_Any_SSSE3; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_SSSE3; + } + if (dst_width % 12 == 0 && !filtering) { + ScaleRowDown38_3 = ScaleRowDown38_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_SSSE3; + } + if (dst_width % 6 == 0 && filtering) { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3; + } + } +#endif + + for (y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; + } + + // 
Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); + } +} + +#define MIN1(x) ((x) < 1 ? 1 : (x)) + +static __inline uint32_t SumPixels(int iboxwidth, const uint16_t *src_ptr) { + uint32_t sum = 0u; + int x; + assert(iboxwidth > 0); + for (x = 0; x < iboxwidth; ++x) { + sum += src_ptr[x]; + } + return sum; +} + +static __inline uint32_t SumPixels_16(int iboxwidth, const uint32_t *src_ptr) { + uint32_t sum = 0u; + int x; + assert(iboxwidth > 0); + for (x = 0; x < iboxwidth; ++x) { + sum += src_ptr[x]; + } + return sum; +} + +static void ScaleAddCols2_C(int dst_width, + int boxheight, + int x, + int dx, + const uint16_t *src_ptr, + uint8_t *dst_ptr) { + int i; + int scaletbl[2]; + int minboxwidth = dx >> 16; + int boxwidth; + scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight); + scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight); + for (i = 0; i < dst_width; ++i) { + int ix = x >> 16; + x += dx; + boxwidth = MIN1((x >> 16) - ix); + int scaletbl_index = boxwidth - minboxwidth; + assert((scaletbl_index == 0) || (scaletbl_index == 1)); + *dst_ptr++ = (uint8_t) (SumPixels(boxwidth, src_ptr + ix) * + scaletbl[scaletbl_index] >> + 16); + } +} + +static void ScaleAddCols0_C(int dst_width, + int boxheight, + int x, + int dx, + const uint16_t *src_ptr, + uint8_t *dst_ptr) { + int scaleval = 65536 / boxheight; + int i; + (void) dx; + src_ptr += (x >> 16); + for (i = 0; i < dst_width; ++i) { + *dst_ptr++ = (uint8_t) (src_ptr[i] * scaleval >> 16); + } +} + +static void ScaleAddCols1_C(int dst_width, + int boxheight, + int x, + int dx, + const uint16_t *src_ptr, + uint8_t *dst_ptr) { + int boxwidth = MIN1(dx >> 16); + int scaleval = 65536 / (boxwidth * boxheight); + int i; + x 
>>= 16; + for (i = 0; i < dst_width; ++i) { + *dst_ptr++ = (uint8_t) (SumPixels(boxwidth, src_ptr + x) * scaleval >> 16); + x += boxwidth; + } +} + +// Scale plane down to any dimensions, with interpolation. +// (boxfilter). +// +// Same method as SimpleScale, which is fixed point, outputting +// one pixel of destination using fixed point (16.16) to step +// through source, sampling a box of pixel with simple +// averaging. +static void ScalePlaneBox(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t *src_ptr, + uint8_t *dst_ptr) { + int j, k; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + const int max_y = (src_height << 16); + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + { + // Allocate a row buffer of uint16_t. + align_buffer_64(row16, src_width * 2); + void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, + const uint16_t *src_ptr, uint8_t *dst_ptr) = + (dx & 0xffff) ? ScaleAddCols2_C + : ((dx != 0x10000) ? 
ScaleAddCols1_C : ScaleAddCols0_C); + void (*ScaleAddRow)(const uint8_t *src_ptr, uint16_t *dst_ptr, + int src_width) = ScaleAddRow_C; +#if defined(HAS_SCALEADDROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleAddRow = ScaleAddRow_Any_SSE2; + if (IS_ALIGNED(src_width, 16)) { + ScaleAddRow = ScaleAddRow_SSE2; + } + } +#endif +#if defined(HAS_SCALEADDROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleAddRow = ScaleAddRow_Any_AVX2; + if (IS_ALIGNED(src_width, 32)) { + ScaleAddRow = ScaleAddRow_AVX2; + } + } +#endif + + + for (j = 0; j < dst_height; ++j) { + int boxheight; + int iy = y >> 16; + const uint8_t *src = src_ptr + iy * (int64_t) src_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + boxheight = MIN1((y >> 16) - iy); + memset(row16, 0, src_width * 2); + for (k = 0; k < boxheight; ++k) { + ScaleAddRow(src, (uint16_t *) (row16), src_width); + src += src_stride; + } + ScaleAddCols(dst_width, boxheight, x, dx, (uint16_t *) (row16), dst_ptr); + dst_ptr += dst_stride; + } + free_aligned_buffer_64(row16); + } +} + +// Scale plane down with bilinear interpolation. +static void ScalePlaneBilinearDown(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t *src_ptr, + uint8_t *dst_ptr, + enum FilterMode filtering) { + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. + // Allocate a row buffer. + align_buffer_64(row, src_width); + + const int max_y = (src_height - 1) << 16; + int j; + void (*ScaleFilterCols)(uint8_t *dst_ptr, const uint8_t *src_ptr, + int dst_width, int x, int dx) = + (src_width >= 32768) ? 
ScaleFilterCols64_C : ScaleFilterCols_C; + void (*InterpolateRow)(uint8_t *dst_ptr, const uint8_t *src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(src_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif + +#if defined(HAS_SCALEFILTERCOLS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_SSSE3; + } +#endif + + if (y > max_y) { + y = max_y; + } + + for (j = 0; j < dst_height; ++j) { + int yi = y >> 16; + const uint8_t *src = src_ptr + yi * (int64_t) src_stride; + if (filtering == kFilterLinear) { + ScaleFilterCols(dst_ptr, src, dst_width, x, dx); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(row, src, src_stride, src_width, yf); + ScaleFilterCols(dst_ptr, row, dst_width, x, dx); + } + dst_ptr += dst_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + } + free_aligned_buffer_64(row); +} + +// Scale up down with bilinear interpolation. +static void ScalePlaneBilinearUp(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t *src_ptr, + uint8_t *dst_ptr, + enum FilterMode filtering) { + int j; + // Initial source x/y coordinate and step values as 16.16 fixed point. 
+ int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + const int max_y = (src_height - 1) << 16; + void (*InterpolateRow)(uint8_t *dst_ptr, const uint8_t *src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + void (*ScaleFilterCols)(uint8_t *dst_ptr, const uint8_t *src_ptr, + int dst_width, int x, int dx) = + filtering ? ScaleFilterCols_C : ScaleCols_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif + + if (filtering && src_width >= 32768) { + ScaleFilterCols = ScaleFilterCols64_C; + } +#if defined(HAS_SCALEFILTERCOLS_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_SSSE3; + } +#endif + + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleFilterCols = ScaleColsUp2_C; +#if defined(HAS_SCALECOLS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleFilterCols = ScaleColsUp2_SSE2; + } +#endif + } + + if (y > max_y) { + y = max_y; + } + { + int yi = y >> 16; + const uint8_t *src = src_ptr + yi * (int64_t) src_stride; + + // Allocate 2 row buffers. 
+ const int row_size = (dst_width + 31) & ~31; + align_buffer_64(row, row_size * 2); + + uint8_t *rowptr = row; + int rowstride = row_size; + int lasty = yi; + + ScaleFilterCols(rowptr, src, dst_width, x, dx); + if (src_height > 1) { + src += src_stride; + } + ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx); + if (src_height > 2) { + src += src_stride; + } + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + src = src_ptr + yi * (int64_t) src_stride; + } + if (yi != lasty) { + ScaleFilterCols(rowptr, src, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + if ((y + 65536) < max_y) { + src += src_stride; + } + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf); + } + dst_ptr += dst_stride; + y += dy; + } + free_aligned_buffer_64(row); + } +} + +// Scale plane, horizontally up by 2 times. +// Uses linear filter horizontally, nearest vertically. +// This is an optimized version for scaling up a plane to 2 times of +// its original width, using linear interpolation. +// This is used to scale U and V planes of I422 to I444. +static void ScalePlaneUp2_Linear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t *src_ptr, + uint8_t *dst_ptr) { + void (*ScaleRowUp)(const uint8_t *src_ptr, uint8_t *dst_ptr, int dst_width) = + ScaleRowUp2_Linear_Any_C; + int i; + int y; + int dy; + + (void) src_width; + // This function can only scale up by 2 times horizontally. 
+ assert(src_width == ((dst_width + 1) / 2)); + +#ifdef HAS_SCALEROWUP2_LINEAR_SSE2 + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleRowUp = ScaleRowUp2_Linear_Any_SSE2; + } +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3 + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowUp = ScaleRowUp2_Linear_Any_SSSE3; + } +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp = ScaleRowUp2_Linear_Any_AVX2; + } +#endif + + + if (dst_height == 1) { + ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t) src_stride, dst_ptr, + dst_width); + } else { + dy = FixedDiv(src_height - 1, dst_height - 1); + y = (1 << 15) - 1; + for (i = 0; i < dst_height; ++i) { + ScaleRowUp(src_ptr + (y >> 16) * (int64_t) src_stride, dst_ptr, dst_width); + dst_ptr += dst_stride; + y += dy; + } + } +} + +// Scale plane, up by 2 times. +// This is an optimized version for scaling up a plane to 2 times of +// its original size, using bilinear interpolation. +// This is used to scale U and V planes of I420 to I444. +static void ScalePlaneUp2_Bilinear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t *src_ptr, + uint8_t *dst_ptr) { + void (*Scale2RowUp)(const uint8_t *src_ptr, ptrdiff_t src_stride, + uint8_t *dst_ptr, ptrdiff_t dst_stride, int dst_width) = + ScaleRowUp2_Bilinear_Any_C; + int x; + + (void) src_width; + // This function can only scale up by 2 times. 
+ assert(src_width == ((dst_width + 1) / 2)); + assert(src_height == ((dst_height + 1) / 2)); + +#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2 + if (TestCpuFlag(kCpuHasSSE2)) { + Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2; + } +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3 + if (TestCpuFlag(kCpuHasSSSE3)) { + Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3; + } +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2; + } +#endif + + + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + dst_ptr += dst_stride; + for (x = 0; x < src_height - 1; ++x) { + Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); + src_ptr += src_stride; + // TODO(fbarchard): Test performance of writing one row of destination at a + // time. + dst_ptr += 2 * dst_stride; + } + if (!(dst_height & 1)) { + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + } +} + +// Scale Plane to/from any dimensions, without interpolation. +// Fixed point math is used for performance: The upper 16 bits +// of x and dx is the integer part of the source position and +// the lower 16 bits are the fixed decimal part. + +static void ScalePlaneSimple(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t *src_ptr, + uint8_t *dst_ptr) { + int i; + void (*ScaleCols)(uint8_t *dst_ptr, const uint8_t *src_ptr, int dst_width, + int x, int dx) = ScaleCols_C; + // Initial source x/y coordinate and step values as 16.16 fixed point. 
+ int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + + if (src_width * 2 == dst_width && x < 0x8000) { + ScaleCols = ScaleColsUp2_C; +#if defined(HAS_SCALECOLS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleCols = ScaleColsUp2_SSE2; + } +#endif + } + + for (i = 0; i < dst_height; ++i) { + ScaleCols(dst_ptr, src_ptr + (y >> 16) * (int64_t) src_stride, dst_width, x, + dx); + dst_ptr += dst_stride; + y += dy; + } +} + +// Scale a plane. +// This function dispatches to a specialized scaler based on scale factor. +LIBYUV_API +void ScalePlane(const uint8_t *src, + int src_stride, + int src_width, + int src_height, + uint8_t *dst, + int dst_stride, + int dst_width, + int dst_height, + enum FilterMode filtering) { + // Simplify filtering when possible. + filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, + filtering); + + // Negative height means invert the image. + if (src_height < 0) { + src_height = -src_height; + src = src + (src_height - 1) * (int64_t) src_stride; + src_stride = -src_stride; + } + // Use specialized scales to improve performance for common resolutions. + // For example, all the 1/2 scalings will use ScalePlaneDown2() + if (dst_width == src_width && dst_height == src_height) { + // Straight copy. + CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height); + return; + } + if (dst_width == src_width && filtering != kFilterBox) { + int dy = 0; + int y = 0; + // When scaling down, use the center 2 rows to filter. + // When scaling up, last row of destination uses the last 2 source rows. + if (dst_height <= src_height) { + dy = FixedDiv(src_height, dst_height); + y = CENTERSTART(dy, -32768); // Subtract 0.5 (32768) to center filter. 
+ } else if (src_height > 1 && dst_height > 1) { + dy = FixedDiv1(src_height, dst_height); + } + // Arbitrary scale vertically, but unscaled horizontally. + ScalePlaneVertical(src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, 0, y, dy, /*bpp=*/1, filtering); + return; + } + if (dst_width <= Abs(src_width) && dst_height <= src_height) { + // Scale down. + if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) { + // optimized, 3/4 + ScalePlaneDown34(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); + return; + } + if (2 * dst_width == src_width && 2 * dst_height == src_height) { + // optimized, 1/2 + ScalePlaneDown2(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); + return; + } + // 3/8 rounded up for odd sized chroma height. + if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) { + // optimized, 3/8 + ScalePlaneDown38(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); + return; + } + if (4 * dst_width == src_width && 4 * dst_height == src_height && + (filtering == kFilterBox || filtering == kFilterNone)) { + // optimized, 1/4 + ScalePlaneDown4(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); + return; + } + } + if (filtering == kFilterBox && dst_height * 2 < src_height) { + ScalePlaneBox(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); + return; + } + if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) { + ScalePlaneUp2_Linear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } + if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && + (filtering == kFilterBilinear || filtering == kFilterBox)) { + ScalePlaneUp2_Bilinear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } + 
if (filtering && dst_height > src_height) { + ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if (filtering) { + ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + ScalePlaneSimple(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); +} + +LIBYUV_API +int I420Scale(const uint8_t *src_y, + int src_stride_y, + const uint8_t *src_u, + int src_stride_u, + const uint8_t *src_v, + int src_stride_v, + int src_width, + int src_height, + uint8_t *dst_y, + int dst_stride_y, + uint8_t *dst_u, + int dst_stride_u, + uint8_t *dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering) { + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int src_halfheight = SUBSAMPLE(src_height, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); + + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, + dst_stride_u, dst_halfwidth, dst_halfheight, filtering); + ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, + dst_stride_v, dst_halfwidth, dst_halfheight, filtering); + return 0; +} diff --git a/pkg/encoder/yuv/libyuv/scale.h b/pkg/encoder/yuv/libyuv/scale.h new file mode 100644 index 000000000..ed0a1983f --- /dev/null +++ b/pkg/encoder/yuv/libyuv/scale.h @@ -0,0 +1,53 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_SCALE_H_ +#define INCLUDE_LIBYUV_SCALE_H_ + +#include "basic_types.h" + +// Supported filtering. +typedef enum FilterMode { + kFilterNone = 0, // Point sample; Fastest. + kFilterLinear = 1, // Filter horizontally only. + kFilterBilinear = 2, // Faster than box, but lower quality scaling down. + kFilterBox = 3 // Highest quality. +} FilterModeEnum; + +// Scales a YUV 4:2:0 image from the src width and height to the +// dst width and height. +// If filtering is kFilterNone, a simple nearest-neighbor algorithm is +// used. This produces basic (blocky) quality at the fastest speed. +// If filtering is kFilterBilinear, interpolation is used to produce a better +// quality image, at the expense of speed. +// If filtering is kFilterBox, averaging is used to produce even better +// quality image, at further expense of speed. +// Returns 0 if successful, -1 if an argument is invalid. + +LIBYUV_API +int I420Scale(const uint8_t *src_y, + int src_stride_y, + const uint8_t *src_u, + int src_stride_u, + const uint8_t *src_v, + int src_stride_v, + int src_width, + int src_height, + uint8_t *dst_y, + int dst_stride_y, + uint8_t *dst_u, + int dst_stride_u, + uint8_t *dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering); + +#endif // INCLUDE_LIBYUV_SCALE_H_ diff --git a/pkg/encoder/yuv/libyuv/scale_any.c b/pkg/encoder/yuv/libyuv/scale_any.c new file mode 100644 index 000000000..f05e55b6e --- /dev/null +++ b/pkg/encoder/yuv/libyuv/scale_any.c @@ -0,0 +1,632 @@ +/* + * Copyright 2015 The LibYuv Project Authors. All rights reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "scale_row.h" + +// Fixed scale down. +// Mask may be non-power of 2, so use MOD +#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \ + int dst_width) { \ + int r = (int)((unsigned int)dst_width % (MASK + 1)); /* NOLINT */ \ + int n = dst_width - r; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ + dst_ptr + n * BPP, r); \ + } + +// Fixed scale down for odd source width. Used by I420Blend subsampling. +// Since dst_width is (width + 1) / 2, this function scales one less pixel +// and copies the last pixel. 
+#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \ + int dst_width) { \ + int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); /* NOLINT */ \ + int n = (dst_width - 1) - r; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ + dst_ptr + n * BPP, r + 1); \ + } + +#ifdef HAS_SCALEROWDOWN2_SSSE3 + +SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15) + +SDANY(ScaleRowDown2Linear_Any_SSSE3, + ScaleRowDown2Linear_SSSE3, + ScaleRowDown2Linear_C, + 2, + 1, + 15) + +SDANY(ScaleRowDown2Box_Any_SSSE3, + ScaleRowDown2Box_SSSE3, + ScaleRowDown2Box_C, + 2, + 1, + 15) + +SDODD(ScaleRowDown2Box_Odd_SSSE3, + ScaleRowDown2Box_SSSE3, + ScaleRowDown2Box_Odd_C, + 2, + 1, + 15) + +#endif +#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3 + +SDANY(ScaleUVRowDown2Box_Any_SSSE3, + ScaleUVRowDown2Box_SSSE3, + ScaleUVRowDown2Box_C, + 2, + 2, + 3) + +#endif +#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2 + +SDANY(ScaleUVRowDown2Box_Any_AVX2, + ScaleUVRowDown2Box_AVX2, + ScaleUVRowDown2Box_C, + 2, + 2, + 7) + +#endif +#ifdef HAS_SCALEROWDOWN2_AVX2 + +SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31) + +SDANY(ScaleRowDown2Linear_Any_AVX2, + ScaleRowDown2Linear_AVX2, + ScaleRowDown2Linear_C, + 2, + 1, + 31) + +SDANY(ScaleRowDown2Box_Any_AVX2, + ScaleRowDown2Box_AVX2, + ScaleRowDown2Box_C, + 2, + 1, + 31) + +SDODD(ScaleRowDown2Box_Odd_AVX2, + ScaleRowDown2Box_AVX2, + ScaleRowDown2Box_Odd_C, + 2, + 1, + 31) + +#endif +#ifdef HAS_SCALEROWDOWN4_SSSE3 + +SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7) + +SDANY(ScaleRowDown4Box_Any_SSSE3, + ScaleRowDown4Box_SSSE3, + ScaleRowDown4Box_C, + 4, + 1, + 7) + +#endif +#ifdef HAS_SCALEROWDOWN4_AVX2 + +SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15) + 
+SDANY(ScaleRowDown4Box_Any_AVX2, + ScaleRowDown4Box_AVX2, + ScaleRowDown4Box_C, + 4, + 1, + 15) + +#endif +#ifdef HAS_SCALEROWDOWN34_SSSE3 + +SDANY(ScaleRowDown34_Any_SSSE3, + ScaleRowDown34_SSSE3, + ScaleRowDown34_C, + 4 / 3, /* FACTOR is pasted unparenthesized, giving (n * 4 / 3) */ + 1, + 23) + +SDANY(ScaleRowDown34_0_Box_Any_SSSE3, + ScaleRowDown34_0_Box_SSSE3, + ScaleRowDown34_0_Box_C, + 4 / 3, + 1, + 23) + +SDANY(ScaleRowDown34_1_Box_Any_SSSE3, + ScaleRowDown34_1_Box_SSSE3, + ScaleRowDown34_1_Box_C, + 4 / 3, + 1, + 23) + +#endif + +#ifdef HAS_SCALEROWDOWN38_SSSE3 + +SDANY(ScaleRowDown38_Any_SSSE3, + ScaleRowDown38_SSSE3, + ScaleRowDown38_C, + 8 / 3, /* likewise expands to (n * 8 / 3); do not add parentheses */ + 1, + 11) + +SDANY(ScaleRowDown38_3_Box_Any_SSSE3, + ScaleRowDown38_3_Box_SSSE3, + ScaleRowDown38_3_Box_C, + 8 / 3, + 1, + 5) + +SDANY(ScaleRowDown38_2_Box_Any_SSSE3, + ScaleRowDown38_2_Box_SSSE3, + ScaleRowDown38_2_Box_C, + 8 / 3, + 1, + 5) + +#endif + + +#undef SDANY + +// Scale down by even scale factor: SIMD does dst_width & ~MASK pixels, C the rest. +#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, \ + uint8_t* dst_ptr, int dst_width) { \ + int r = dst_width & MASK; \ + int n = dst_width & ~MASK; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, src_stepx, \ + dst_ptr + n * BPP, r); \ + } + + + +#ifdef SASIMDONLY +// This also works and uses memcpy and SIMD instead of C, but is slower on ARM + +// Add rows box filter scale down. 
Using macro from row_any +#define SAROW(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint16_t dst_temp[32]); \ + SIMD_ALIGNED(uint8_t src_temp[32]); \ + memset(dst_temp, 0, 32 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, n); \ + } \ + memcpy(src_temp, src_ptr + n * SBPP, r * SBPP); \ + memcpy(dst_temp, (const uint8_t*)dst_ptr + n * BPP, r * BPP); /* BPP is a byte size: offset in bytes */ \ + ANY_SIMD(src_temp, dst_temp, MASK + 1); \ + memcpy((uint8_t*)dst_ptr + n * BPP, dst_temp, r * BPP); /* write back only the r valid sums */ \ + } + +#ifdef HAS_SCALEADDROW_SSE2 +SAROW(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, 1, 2, 15) +#endif +#ifdef HAS_SCALEADDROW_AVX2 +SAROW(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, 1, 2, 31) +#endif +#undef SAROW + +#else + +// Add rows box filter scale down. +#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { \ + int n = src_width & ~MASK; \ + if (n > 0) { \ + SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \ + } \ + SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \ + } + +#ifdef HAS_SCALEADDROW_SSE2 + +SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15) + +#endif +#ifdef HAS_SCALEADDROW_AVX2 + +SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31) + +#endif +#undef SAANY + +#endif // SASIMDONLY + +// Scale up horizontally 2 times using linear filter. 
+#define SUH2LANY(NAME, SIMD, C, MASK, PTYPE) \ + void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \ + int work_width = (dst_width - 1) & ~1; \ + int r = work_width & MASK; \ + int n = work_width & ~MASK; \ + dst_ptr[0] = src_ptr[0]; \ + if (work_width > 0) { \ + if (n != 0) { \ + SIMD(src_ptr, dst_ptr + 1, n); \ + } \ + C(src_ptr + (n / 2), dst_ptr + n + 1, r); \ + } \ + dst_ptr[dst_width - 1] = src_ptr[(dst_width - 1) / 2]; \ + } + +// Even the C versions need to be wrapped, because boundary pixels have to +// be handled differently + +SUH2LANY(ScaleRowUp2_Linear_Any_C, + ScaleRowUp2_Linear_C, + ScaleRowUp2_Linear_C, + 0, + uint8_t) + +SUH2LANY(ScaleRowUp2_Linear_16_Any_C, + ScaleRowUp2_Linear_16_C, + ScaleRowUp2_Linear_16_C, + 0, + uint16_t) + +#ifdef HAS_SCALEROWUP2_LINEAR_SSE2 + +SUH2LANY(ScaleRowUp2_Linear_Any_SSE2, + ScaleRowUp2_Linear_SSE2, + ScaleRowUp2_Linear_C, + 15, + uint8_t) + +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3 + +SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3, + ScaleRowUp2_Linear_SSSE3, + ScaleRowUp2_Linear_C, + 15, + uint8_t) + +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3 + +SUH2LANY(ScaleRowUp2_Linear_12_Any_SSSE3, + ScaleRowUp2_Linear_12_SSSE3, + ScaleRowUp2_Linear_16_C, + 15, + uint16_t) + +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2 + +SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2, + ScaleRowUp2_Linear_16_SSE2, + ScaleRowUp2_Linear_16_C, + 7, + uint16_t) + +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_AVX2 + +SUH2LANY(ScaleRowUp2_Linear_Any_AVX2, + ScaleRowUp2_Linear_AVX2, + ScaleRowUp2_Linear_C, + 31, + uint8_t) + +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_12_AVX2 + +SUH2LANY(ScaleRowUp2_Linear_12_Any_AVX2, + ScaleRowUp2_Linear_12_AVX2, + ScaleRowUp2_Linear_16_C, + 31, + uint16_t) + +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_16_AVX2 + +SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2, + ScaleRowUp2_Linear_16_AVX2, + ScaleRowUp2_Linear_16_C, + 15, + uint16_t) + +#endif +#undef SUH2LANY + +// Scale up 2 times using bilinear filter. 
+// This function produces 2 rows at a time. +#define SU2BLANY(NAME, SIMD, C, MASK, PTYPE) \ + void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \ + ptrdiff_t dst_stride, int dst_width) { \ + int work_width = (dst_width - 1) & ~1; \ + int r = work_width & MASK; \ + int n = work_width & ~MASK; \ + const PTYPE* sa = src_ptr; \ + const PTYPE* sb = src_ptr + src_stride; \ + PTYPE* da = dst_ptr; \ + PTYPE* db = dst_ptr + dst_stride; \ + da[0] = (3 * sa[0] + sb[0] + 2) >> 2; \ + db[0] = (sa[0] + 3 * sb[0] + 2) >> 2; \ + if (work_width > 0) { \ + if (n != 0) { \ + SIMD(sa, sb - sa, da + 1, db - da, n); \ + } \ + C(sa + (n / 2), sb - sa, da + n + 1, db - da, r); \ + } \ + da[dst_width - 1] = \ + (3 * sa[(dst_width - 1) / 2] + sb[(dst_width - 1) / 2] + 2) >> 2; \ + db[dst_width - 1] = \ + (sa[(dst_width - 1) / 2] + 3 * sb[(dst_width - 1) / 2] + 2) >> 2; \ + } + +SU2BLANY(ScaleRowUp2_Bilinear_Any_C, + ScaleRowUp2_Bilinear_C, + ScaleRowUp2_Bilinear_C, + 0, + uint8_t) + +SU2BLANY(ScaleRowUp2_Bilinear_16_Any_C, + ScaleRowUp2_Bilinear_16_C, + ScaleRowUp2_Bilinear_16_C, + 0, + uint16_t) + +#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2 + +SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2, + ScaleRowUp2_Bilinear_SSE2, + ScaleRowUp2_Bilinear_C, + 15, + uint8_t) + +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3 + +SU2BLANY(ScaleRowUp2_Bilinear_12_Any_SSSE3, + ScaleRowUp2_Bilinear_12_SSSE3, + ScaleRowUp2_Bilinear_16_C, + 15, + uint16_t) + +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2 + +SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSE2, + ScaleRowUp2_Bilinear_16_SSE2, + ScaleRowUp2_Bilinear_16_C, + 7, + uint16_t) + +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3 + +SU2BLANY(ScaleRowUp2_Bilinear_Any_SSSE3, + ScaleRowUp2_Bilinear_SSSE3, + ScaleRowUp2_Bilinear_C, + 15, + uint8_t) + +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_AVX2 + +SU2BLANY(ScaleRowUp2_Bilinear_Any_AVX2, + ScaleRowUp2_Bilinear_AVX2, + ScaleRowUp2_Bilinear_C, + 31, + uint8_t) + +#endif + +#ifdef 
HAS_SCALEROWUP2_BILINEAR_12_AVX2 + +SU2BLANY(ScaleRowUp2_Bilinear_12_Any_AVX2, + ScaleRowUp2_Bilinear_12_AVX2, + ScaleRowUp2_Bilinear_16_C, + 15, + uint16_t) + +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_16_AVX2 + +SU2BLANY(ScaleRowUp2_Bilinear_16_Any_AVX2, + ScaleRowUp2_Bilinear_16_AVX2, + ScaleRowUp2_Bilinear_16_C, + 15, + uint16_t) + +#endif + +#undef SU2BLANY + +// Scale bi-planar plane up horizontally 2 times using linear filter. +#define SBUH2LANY(NAME, SIMD, C, MASK, PTYPE) \ + void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \ + int work_width = (dst_width - 1) & ~1; \ + int r = work_width & MASK; \ + int n = work_width & ~MASK; \ + dst_ptr[0] = src_ptr[0]; \ + dst_ptr[1] = src_ptr[1]; \ + if (work_width > 0) { \ + if (n != 0) { \ + SIMD(src_ptr, dst_ptr + 2, n); \ + } \ + C(src_ptr + n, dst_ptr + 2 * n + 2, r); \ + } \ + dst_ptr[2 * dst_width - 2] = src_ptr[((dst_width + 1) & ~1) - 2]; \ + dst_ptr[2 * dst_width - 1] = src_ptr[((dst_width + 1) & ~1) - 1]; \ + } + +SBUH2LANY(ScaleUVRowUp2_Linear_Any_C, + ScaleUVRowUp2_Linear_C, + ScaleUVRowUp2_Linear_C, + 0, + uint8_t) + +SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_C, + ScaleUVRowUp2_Linear_16_C, + ScaleUVRowUp2_Linear_16_C, + 0, + uint16_t) + +#ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3 + +SBUH2LANY(ScaleUVRowUp2_Linear_Any_SSSE3, + ScaleUVRowUp2_Linear_SSSE3, + ScaleUVRowUp2_Linear_C, + 7, + uint8_t) + +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2 + +SBUH2LANY(ScaleUVRowUp2_Linear_Any_AVX2, + ScaleUVRowUp2_Linear_AVX2, + ScaleUVRowUp2_Linear_C, + 15, + uint8_t) + +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 + +SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_SSE41, + ScaleUVRowUp2_Linear_16_SSE41, + ScaleUVRowUp2_Linear_16_C, + 3, + uint16_t) + +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2 + +SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_AVX2, + ScaleUVRowUp2_Linear_16_AVX2, + ScaleUVRowUp2_Linear_16_C, + 7, + uint16_t) + +#endif + +#undef SBUH2LANY + +// Scale bi-planar plane up 2 times using bilinear 
filter. +// This function produces 2 rows at a time. +#define SBU2BLANY(NAME, SIMD, C, MASK, PTYPE) \ + void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \ + ptrdiff_t dst_stride, int dst_width) { \ + int work_width = (dst_width - 1) & ~1; \ + int r = work_width & MASK; \ + int n = work_width & ~MASK; \ + const PTYPE* sa = src_ptr; \ + const PTYPE* sb = src_ptr + src_stride; \ + PTYPE* da = dst_ptr; \ + PTYPE* db = dst_ptr + dst_stride; \ + da[0] = (3 * sa[0] + sb[0] + 2) >> 2; \ + db[0] = (sa[0] + 3 * sb[0] + 2) >> 2; \ + da[1] = (3 * sa[1] + sb[1] + 2) >> 2; \ + db[1] = (sa[1] + 3 * sb[1] + 2) >> 2; \ + if (work_width > 0) { \ + if (n != 0) { \ + SIMD(sa, sb - sa, da + 2, db - da, n); \ + } \ + C(sa + n, sb - sa, da + 2 * n + 2, db - da, r); \ + } \ + da[2 * dst_width - 2] = (3 * sa[((dst_width + 1) & ~1) - 2] + \ + sb[((dst_width + 1) & ~1) - 2] + 2) >> \ + 2; \ + db[2 * dst_width - 2] = (sa[((dst_width + 1) & ~1) - 2] + \ + 3 * sb[((dst_width + 1) & ~1) - 2] + 2) >> \ + 2; \ + da[2 * dst_width - 1] = (3 * sa[((dst_width + 1) & ~1) - 1] + \ + sb[((dst_width + 1) & ~1) - 1] + 2) >> \ + 2; \ + db[2 * dst_width - 1] = (sa[((dst_width + 1) & ~1) - 1] + \ + 3 * sb[((dst_width + 1) & ~1) - 1] + 2) >> \ + 2; \ + } + +SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_C, + ScaleUVRowUp2_Bilinear_C, + ScaleUVRowUp2_Bilinear_C, + 0, + uint8_t) + +SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_C, + ScaleUVRowUp2_Bilinear_16_C, + ScaleUVRowUp2_Bilinear_16_C, + 0, + uint16_t) + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3 + +SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_SSSE3, + ScaleUVRowUp2_Bilinear_SSSE3, + ScaleUVRowUp2_Bilinear_C, + 7, + uint8_t) + +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_AVX2 + +SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_AVX2, + ScaleUVRowUp2_Bilinear_AVX2, + ScaleUVRowUp2_Bilinear_C, + 15, + uint8_t) + +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 + +SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_SSE41, + ScaleUVRowUp2_Bilinear_16_SSE41, + 
ScaleUVRowUp2_Bilinear_16_C, + 7, + uint16_t) + +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2 + +SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_AVX2, + ScaleUVRowUp2_Bilinear_16_AVX2, + ScaleUVRowUp2_Bilinear_16_C, + 7, + uint16_t) + +#endif + +#undef SBU2BLANY diff --git a/pkg/encoder/yuv/libyuv/scale_common.c b/pkg/encoder/yuv/libyuv/scale_common.c new file mode 100644 index 000000000..17eedd992 --- /dev/null +++ b/pkg/encoder/yuv/libyuv/scale_common.c @@ -0,0 +1,930 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "scale.h" + +#include <assert.h> + +#include "cpu_id.h" +#include "row.h" +#include "scale_row.h" + +#define STATIC_CAST(type, expr) (type)(expr) + +// TODO(fbarchard): make clamp255 preserve negative values. +static __inline int32_t clamp255(int32_t v) { + return (-(v >= 255) | v) & 255; +} + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits +// 16384 = 10 bits +// 4096 = 12 bits +// 256 = 16 bits +// TODO(fbarchard): change scale to bits +#define C16TO8(v, scale) clamp255(((v) * (scale)) >> 16) + +static __inline int Abs(int v) { + return v >= 0 ? 
v : -v; +} + +// CPU agnostic row functions +void ScaleRowDown2_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst, + int dst_width) { + int x; + (void) src_stride; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src_ptr[1]; + dst[1] = src_ptr[3]; + dst += 2; + src_ptr += 4; + } + if (dst_width & 1) { + dst[0] = src_ptr[1]; + } +} + +void ScaleRowDown2Linear_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst, + int dst_width) { + const uint8_t *s = src_ptr; + int x; + (void) src_stride; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + 1) >> 1; + dst[1] = (s[2] + s[3] + 1) >> 1; + dst += 2; + s += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + 1) >> 1; + } +} + +void ScaleRowDown2Box_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst, + int dst_width) { + const uint8_t *s = src_ptr; + const uint8_t *t = src_ptr + src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; + dst += 2; + s += 4; + t += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + } +} + +void ScaleRowDown2Box_Odd_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst, + int dst_width) { + const uint8_t *s = src_ptr; + const uint8_t *t = src_ptr + src_stride; + int x; + dst_width -= 1; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; + dst += 2; + s += 4; + t += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + dst += 1; + s += 2; + t += 2; + } + dst[0] = (s[0] + t[0] + 1) >> 1; +} + +void ScaleRowDown4_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst, + int dst_width) { + int x; + (void) src_stride; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src_ptr[2]; + dst[1] = src_ptr[6]; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + 
dst[0] = src_ptr[2]; + } +} + +void ScaleRowDown4Box_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst, + int dst_width) { + intptr_t stride = src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> + 4; + dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] + + src_ptr[stride + 7] + src_ptr[stride * 2 + 4] + + src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] + + src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] + + src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] + + src_ptr[stride * 3 + 7] + 8) >> + 4; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> + 4; + } +} + +void ScaleRowDown34_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst, + int dst_width) { + int x; + (void) src_stride; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[1]; + dst[2] = src_ptr[3]; + dst += 3; + src_ptr += 4; + } +} + +// Filter rows 0 and 1 together, 3 : 1 +void ScaleRowDown34_0_Box_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *d, + int dst_width) { + const uint8_t *s = src_ptr; + const uint8_t *t = src_ptr + src_stride; + int x; 
+ assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 * 3 + b0 + 2) >> 2; + d[1] = (a1 * 3 + b1 + 2) >> 2; + d[2] = (a2 * 3 + b2 + 2) >> 2; + d += 3; + s += 4; + t += 4; + } +} + +// Filter rows 1 and 2 together, 1 : 1 +void ScaleRowDown34_1_Box_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *d, + int dst_width) { + const uint8_t *s = src_ptr; + const uint8_t *t = src_ptr + src_stride; + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 + b0 + 1) >> 1; + d[1] = (a1 + b1 + 1) >> 1; + d[2] = (a2 + b2 + 1) >> 1; + d += 3; + s += 4; + t += 4; + } +} + +// Sample position: (O is src sample position, X is dst sample position) +// +// v dst_ptr at here v stop at here +// X O X X O X X O X X O X X O X +// ^ src_ptr at here +void ScaleRowUp2_Linear_C(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width) { + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 2; + dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2; + } +} + +// Sample position: (O is src sample position, X is dst sample position) +// +// src_ptr at here +// X v X X X X X X X X X +// O O O O O +// X X X X X X X X X X +// ^ dst_ptr at here ^ stop at here +// X X X 
X X X X X X X +// O O O O O +// X X X X X X X X X X +void ScaleRowUp2_Bilinear_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint8_t *s = src_ptr; + const uint8_t *t = src_ptr + src_stride; + uint8_t *d = dst_ptr; + uint8_t *e = dst_ptr + dst_stride; + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + d[2 * x + 0] = + (s[x + 0] * 9 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 1 + 8) >> 4; + d[2 * x + 1] = + (s[x + 0] * 3 + s[x + 1] * 9 + t[x + 0] * 1 + t[x + 1] * 3 + 8) >> 4; + e[2 * x + 0] = + (s[x + 0] * 3 + s[x + 1] * 1 + t[x + 0] * 9 + t[x + 1] * 3 + 8) >> 4; + e[2 * x + 1] = + (s[x + 0] * 1 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 9 + 8) >> 4; + } +} + +// Only suitable for at most 14 bit range. +void ScaleRowUp2_Linear_16_C(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width) { + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 2; + dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2; + } +} + +// Only suitable for at most 12bit range. 
+void ScaleRowUp2_Bilinear_16_C(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint16_t *s = src_ptr; + const uint16_t *t = src_ptr + src_stride; + uint16_t *d = dst_ptr; + uint16_t *e = dst_ptr + dst_stride; + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + d[2 * x + 0] = + (s[x + 0] * 9 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 1 + 8) >> 4; + d[2 * x + 1] = + (s[x + 0] * 3 + s[x + 1] * 9 + t[x + 0] * 1 + t[x + 1] * 3 + 8) >> 4; + e[2 * x + 0] = + (s[x + 0] * 3 + s[x + 1] * 1 + t[x + 0] * 9 + t[x + 1] * 3 + 8) >> 4; + e[2 * x + 1] = + (s[x + 0] * 1 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 9 + 8) >> 4; + } +} + +// (1-f)a + fb can be replaced with a + f(b-a) +#if defined(__arm__) || defined(__aarch64__) +#define BLENDER(a, b, f) \ + (uint8_t)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) +#else +// Intel uses 7 bit math with rounding. 
+#define BLENDER(a, b, f) \ + (uint8_t)((int)(a) + (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7)) +#endif + +void ScaleFilterCols_C(uint8_t *dst_ptr, + const uint8_t *src_ptr, + int dst_width, + int x, + int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} + +void ScaleFilterCols64_C(uint8_t *dst_ptr, + const uint8_t *src_ptr, + int dst_width, + int x32, + int dx) { + int64_t x = (int64_t) (x32); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int64_t xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int64_t xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} + +#undef BLENDER + +// Same as 8 bit arm blender but return is cast to uint16_t +#define BLENDER(a, b, f) \ + (uint16_t)( \ + (int)(a) + \ + (int)((((int64_t)((f)) * ((int64_t)(b) - (int)(a))) + 0x8000) >> 16)) +#undef BLENDER + +void ScaleRowDown38_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst, + int dst_width) { + int x; + (void) src_stride; + assert(dst_width % 3 == 0); + for (x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[3]; + dst[2] = src_ptr[6]; + dst += 3; + src_ptr += 8; + } +} + +// 8x3 -> 3x1 +void ScaleRowDown38_3_Box_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + intptr_t stride = src_stride; + int i; + 
assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i += 3) { + dst_ptr[0] = + (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * + (65536 / 9) >> + 16; + dst_ptr[1] = + (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * + (65536 / 9) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * + (65536 / 6) >> + 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +// 8x2 -> 3x1 +void ScaleRowDown38_2_Box_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + intptr_t stride = src_stride; + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i += 3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2]) * + (65536 / 6) >> + 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5]) * + (65536 / 6) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) * + (65536 / 4) >> + 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +void ScaleAddRow_C(const uint8_t *src_ptr, uint16_t *dst_ptr, int src_width) { + int x; + assert(src_width > 0); + for (x = 0; x < src_width - 1; x += 2) { + dst_ptr[0] += src_ptr[0]; + dst_ptr[1] += src_ptr[1]; + src_ptr += 2; + dst_ptr += 2; + } + if (src_width & 1) { + dst_ptr[0] += src_ptr[0]; + } +} + +// UV scale row functions +// same as ARGB but 2 channels + +void ScaleUVRowDown2_C(const uint8_t *src_uv, + ptrdiff_t src_stride, + uint8_t *dst_uv, + int dst_width) { + int x; + (void) src_stride; + for (x = 0; x < dst_width; 
++x) { + dst_uv[0] = src_uv[2]; // Store the 2nd UV + dst_uv[1] = src_uv[3]; + src_uv += 4; + dst_uv += 2; + } +} + +void ScaleUVRowDown2Linear_C(const uint8_t *src_uv, + ptrdiff_t src_stride, + uint8_t *dst_uv, + int dst_width) { + int x; + (void) src_stride; + for (x = 0; x < dst_width; ++x) { + dst_uv[0] = (src_uv[0] + src_uv[2] + 1) >> 1; + dst_uv[1] = (src_uv[1] + src_uv[3] + 1) >> 1; + src_uv += 4; + dst_uv += 2; + } +} + +void ScaleUVRowDown2Box_C(const uint8_t *src_uv, + ptrdiff_t src_stride, + uint8_t *dst_uv, + int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] + + src_uv[src_stride + 2] + 2) >> + 2; + dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] + + src_uv[src_stride + 3] + 2) >> + 2; + src_uv += 4; + dst_uv += 2; + } +} + +void ScaleUVRowDownEven_C(const uint8_t *src_uv, + ptrdiff_t src_stride, + int src_stepx, + uint8_t *dst_uv, + int dst_width) { + const uint16_t *src = (const uint16_t *) (src_uv); + uint16_t *dst = (uint16_t *) (dst_uv); + (void) src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src[0]; + dst[1] = src[src_stepx]; + src += src_stepx * 2; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[0]; + } +} + +// Scales a single row of pixels using point sampling. +void ScaleCols_C(uint8_t *dst_ptr, + const uint8_t *src_ptr, + int dst_width, + int x, + int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst_ptr[0] = src_ptr[x >> 16]; + x += dx; + dst_ptr[1] = src_ptr[x >> 16]; + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[x >> 16]; + } +} + +// Scales a single row of pixels up by 2x using point sampling. 
+void ScaleColsUp2_C(uint8_t *dst_ptr, + const uint8_t *src_ptr, + int dst_width, + int x, + int dx) { + int j; + (void) x; + (void) dx; + for (j = 0; j < dst_width - 1; j += 2) { + dst_ptr[1] = dst_ptr[0] = src_ptr[0]; + src_ptr += 1; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[0]; + } +} + +void ScaleUVRowUp2_Linear_C(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width) { + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + dst_ptr[4 * x + 0] = + (src_ptr[2 * x + 0] * 3 + src_ptr[2 * x + 2] * 1 + 2) >> 2; + dst_ptr[4 * x + 1] = + (src_ptr[2 * x + 1] * 3 + src_ptr[2 * x + 3] * 1 + 2) >> 2; + dst_ptr[4 * x + 2] = + (src_ptr[2 * x + 0] * 1 + src_ptr[2 * x + 2] * 3 + 2) >> 2; + dst_ptr[4 * x + 3] = + (src_ptr[2 * x + 1] * 1 + src_ptr[2 * x + 3] * 3 + 2) >> 2; + } +} + +void ScaleUVRowUp2_Bilinear_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint8_t *s = src_ptr; + const uint8_t *t = src_ptr + src_stride; + uint8_t *d = dst_ptr; + uint8_t *e = dst_ptr + dst_stride; + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + + t[2 * x + 2] * 1 + 8) >> + 4; + d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + + t[2 * x + 3] * 1 + 8) >> + 4; + d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 + + t[2 * x + 2] * 3 + 8) >> + 4; + d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 + + t[2 * x + 3] * 3 + 8) >> + 4; + e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 + + t[2 * x + 2] * 3 + 8) >> + 4; + e[4 * x + 1] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 + + t[2 * x + 3] * 3 + 8) >> + 4; + e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x 
+ 0] * 3 + + t[2 * x + 2] * 9 + 8) >> + 4; + e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + + t[2 * x + 3] * 9 + 8) >> + 4; + } +} + +void ScaleUVRowUp2_Linear_16_C(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width) { + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + dst_ptr[4 * x + 0] = + (src_ptr[2 * x + 0] * 3 + src_ptr[2 * x + 2] * 1 + 2) >> 2; + dst_ptr[4 * x + 1] = + (src_ptr[2 * x + 1] * 3 + src_ptr[2 * x + 3] * 1 + 2) >> 2; + dst_ptr[4 * x + 2] = + (src_ptr[2 * x + 0] * 1 + src_ptr[2 * x + 2] * 3 + 2) >> 2; + dst_ptr[4 * x + 3] = + (src_ptr[2 * x + 1] * 1 + src_ptr[2 * x + 3] * 3 + 2) >> 2; + } +} + +void ScaleUVRowUp2_Bilinear_16_C(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint16_t *s = src_ptr; + const uint16_t *t = src_ptr + src_stride; + uint16_t *d = dst_ptr; + uint16_t *e = dst_ptr + dst_stride; + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + + t[2 * x + 2] * 1 + 8) >> + 4; + d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + + t[2 * x + 3] * 1 + 8) >> + 4; + d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 + + t[2 * x + 2] * 3 + 8) >> + 4; + d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 + + t[2 * x + 3] * 3 + 8) >> + 4; + e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 + + t[2 * x + 2] * 3 + 8) >> + 4; + e[4 * x + 1] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 + + t[2 * x + 3] * 3 + 8) >> + 4; + e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + + t[2 * x + 2] * 9 + 8) >> + 4; + e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + + t[2 * x + 3] * 9 + 8) 
>> + 4; + } +} + +// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607. +// Mimics SSSE3 blender +#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7 +#define BLENDERC(a, b, f, s) \ + (uint16_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) +#define BLENDER(a, b, f) BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0) + +void ScaleUVFilterCols_C(uint8_t *dst_uv, + const uint8_t *src_uv, + int dst_width, + int x, + int dx) { + const uint16_t *src = (const uint16_t *) (src_uv); + uint16_t *dst = (uint16_t *) (dst_uv); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint16_t a = src[xi]; + uint16_t b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + x += dx; + xi = x >> 16; + xf = (x >> 9) & 0x7f; + a = src[xi]; + b = src[xi + 1]; + dst[1] = BLENDER(a, b, xf); + x += dx; + dst += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint16_t a = src[xi]; + uint16_t b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + } +} + +#undef BLENDER1 +#undef BLENDERC +#undef BLENDER + +// Scale plane vertically with bilinear interpolation. +void ScalePlaneVertical(int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t *src_argb, + uint8_t *dst_argb, + int x, + int y, + int dy, + int bpp, // bytes per pixel. 4 for ARGB. + enum FilterMode filtering) { + // TODO(fbarchard): Allow higher bpp. + int dst_width_bytes = dst_width * bpp; + void (*InterpolateRow)(uint8_t *dst_argb, const uint8_t *src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + const int max_y = (src_height > 1) ? 
((src_height - 1) << 16) - 1 : 0; + int j; + assert(bpp >= 1 && bpp <= 4); + assert(src_height != 0); + assert(dst_width > 0); + assert(dst_height > 0); + src_argb += (x >> 16) * bpp; +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width_bytes, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif + + + for (j = 0; j < dst_height; ++j) { + int yi; + int yf; + if (y > max_y) { + y = max_y; + } + yi = y >> 16; + yf = filtering ? ((y >> 8) & 255) : 0; + InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride, + dst_width_bytes, yf); + dst_argb += dst_stride; + y += dy; + } +} + +// Simplify the filtering based on scale factors. +enum FilterMode ScaleFilterReduce(int src_width, + int src_height, + int dst_width, + int dst_height, + enum FilterMode filtering) { + if (src_width < 0) { + src_width = -src_width; + } + if (src_height < 0) { + src_height = -src_height; + } + if (filtering == kFilterBox) { + // If scaling either axis to 0.5 or larger, switch from Box to Bilinear. + if (dst_width * 2 >= src_width || dst_height * 2 >= src_height) { + filtering = kFilterBilinear; + } + } + if (filtering == kFilterBilinear) { + if (src_height == 1) { + filtering = kFilterLinear; + } + // TODO(fbarchard): Detect any odd scale factor and reduce to Linear. + if (dst_height == src_height || dst_height * 3 == src_height) { + filtering = kFilterLinear; + } + // TODO(fbarchard): Remove 1 pixel wide filter restriction, which is to + // avoid reading 2 pixels horizontally that causes memory exception. 
+ if (src_width == 1) { + filtering = kFilterNone; + } + } + if (filtering == kFilterLinear) { + if (src_width == 1) { + filtering = kFilterNone; + } + // TODO(fbarchard): Detect any odd scale factor and reduce to None. + if (dst_width == src_width || dst_width * 3 == src_width) { + filtering = kFilterNone; + } + } + return filtering; +} + +#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s) + +// Compute slope values for stepping. +void ScaleSlope(int src_width, + int src_height, + int dst_width, + int dst_height, + enum FilterMode filtering, + int *x, + int *y, + int *dx, + int *dy) { + assert(x != NULL); + assert(y != NULL); + assert(dx != NULL); + assert(dy != NULL); + assert(src_width != 0); + assert(src_height != 0); + assert(dst_width > 0); + assert(dst_height > 0); + // Check for 1 pixel and avoid FixedDiv overflow. + if (dst_width == 1 && src_width >= 32768) { + dst_width = src_width; + } + if (dst_height == 1 && src_height >= 32768) { + dst_height = src_height; + } + if (filtering == kFilterBox) { + // Scale step for point sampling duplicates all pixels equally. + *dx = FixedDiv(Abs(src_width), dst_width); + *dy = FixedDiv(src_height, dst_height); + *x = 0; + *y = 0; + } else if (filtering == kFilterBilinear) { + // Scale step for bilinear sampling renders last pixel once for upsample. + if (dst_width <= Abs(src_width)) { + *dx = FixedDiv(Abs(src_width), dst_width); + *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. + } else if (src_width > 1 && dst_width > 1) { + *dx = FixedDiv1(Abs(src_width), dst_width); + *x = 0; + } + if (dst_height <= src_height) { + *dy = FixedDiv(src_height, dst_height); + *y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter. + } else if (src_height > 1 && dst_height > 1) { + *dy = FixedDiv1(src_height, dst_height); + *y = 0; + } + } else if (filtering == kFilterLinear) { + // Scale step for bilinear sampling renders last pixel once for upsample. 
+ if (dst_width <= Abs(src_width)) { + *dx = FixedDiv(Abs(src_width), dst_width); + *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. + } else if (src_width > 1 && dst_width > 1) { + *dx = FixedDiv1(Abs(src_width), dst_width); + *x = 0; + } + *dy = FixedDiv(src_height, dst_height); + *y = *dy >> 1; + } else { + // Scale step for point sampling duplicates all pixels equally. + *dx = FixedDiv(Abs(src_width), dst_width); + *dy = FixedDiv(src_height, dst_height); + *x = CENTERSTART(*dx, 0); + *y = CENTERSTART(*dy, 0); + } + // Negative src_width means horizontally mirror. + if (src_width < 0) { + *x += (dst_width - 1) * *dx; + *dx = -*dx; + // src_width = -src_width; // Caller must do this. + } +} + +#undef CENTERSTART diff --git a/pkg/encoder/yuv/libyuv/scale_gcc.c b/pkg/encoder/yuv/libyuv/scale_gcc.c new file mode 100644 index 000000000..716d6cfdb --- /dev/null +++ b/pkg/encoder/yuv/libyuv/scale_gcc.c @@ -0,0 +1,2651 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "row.h" +#include "scale_row.h" + +// This module is for GCC x86 and x64. +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) + +// Offsets for source bytes 0 to 9 +static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, + 128, 128, 128, 128, 128, 128, 128, 128}; + +// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. +static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12, + 128, 128, 128, 128, 128, 128, 128, 128}; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 
+static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15, + 128, 128, 128, 128, 128, 128, 128, 128}; + +// Offsets for source bytes 0 to 10 +static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; + +// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13. +static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, + 8, 9, 9, 10, 10, 11, 12, 13}; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15. +static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10, + 10, 11, 12, 13, 13, 14, 14, 15}; + +// Coefficients for source bytes 0 to 10 +static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2}; + +// Coefficients for source bytes 10 to 21 +static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1}; + +// Coefficients for source bytes 21 to 31 +static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3}; + +// Rounding term (2) added to each 16-bit sum before the shift right by 2 in the 3/4 box filters +static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2}; +// pshufb mask: source bytes 0,3,6,8,11,14 go to output bytes 0-5; selector 128 zeroes the byte +static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; +// pshufb mask: source bytes 0,3,6,8,11,14 go to output bytes 6-11, so it can be merged with the kShuf38a result +static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3, + 6, 8, 11, 14, 128, 128, 128, 128}; + +// Arrange words 0,3,6 into 0,1,2 +static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; + +// Arrange words 0,3,6 into 3,4,5 +static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1, + 6, 7, 12, 13, 128, 128, 128, 128}; + +// Scaling values for boxes of 3x3 and 2x3 +static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, + 65536 / 9, 65536 / 6, 0, 0}; + +// Arrange first value for pixels 0,1,2,3,4,5 +static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128, + 11, 128, 14, 128, 128, 128, 128, 128}; + +// Arrange second value for pixels 0,1,2,3,4,5 +static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128, + 12, 128, 15, 128, 128, 128, 128, 128}; + +// Arrange third
value for pixels 0,1,2,3,4,5 +static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128, + 13, 128, 128, 128, 128, 128, 128, 128}; + +// Scaling values for boxes of 3x2 and 2x2 +static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, + 65536 / 3, 65536 / 2, 0, 0}; + +// GCC versions of row functions are verbatim conversions from Visual C. +// Generated using gcc disassembly on Visual C object file: +// objdump -D yuvscaler.obj >yuvscaler.txt + +void ScaleRowDown2_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + (void) src_stride; + asm volatile( + // 16 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); +} + +void ScaleRowDown2Linear_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + (void) src_stride; + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pavgw %%xmm5,%%xmm0 \n" + "pavgw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm4", "xmm5"); +} + +void ScaleRowDown2Box_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + 
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "psrlw $0x1,%%xmm0 \n" + "psrlw $0x1,%%xmm1 \n" + "pavgw %%xmm5,%%xmm0 \n" + "pavgw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} + +#ifdef HAS_SCALEROWDOWN2_AVX2 + +void ScaleRowDown2_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + (void) src_stride; + asm volatile(LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); +} + +void ScaleRowDown2Linear_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + (void) src_stride; + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 
\n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm4", "xmm5"); +} + +void ScaleRowDown2Box_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vpsrlw $0x1,%%ymm0,%%ymm0 \n" + "vpsrlw $0x1,%%ymm1,%%ymm1 \n" + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} + +#endif // HAS_SCALEROWDOWN2_AVX2 + +void ScaleRowDown4_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + (void) src_stride; + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x18,%%xmm5 \n" + "pslld $0x10,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : 
"+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm5"); +} + +void ScaleRowDown4Box_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + intptr_t stridex3; + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "packuswb %%xmm4,%%xmm4 \n" + "psllw $0x3,%%xmm5 \n" + "lea 0x00(%4,%4,2),%3 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "movdqu 0x00(%0,%4,2),%%xmm2 \n" + "movdqu 0x10(%0,%4,2),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "phaddw %%xmm1,%%xmm0 \n" + "paddw %%xmm5,%%xmm0 \n" + "psrlw $0x4,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "=&r"(stridex3) // %3 + : "r"((intptr_t) (src_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +#ifdef HAS_SCALEROWDOWN4_AVX2 + +void ScaleRowDown4_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + (void) src_stride; + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrld $0x18,%%ymm5,%%ymm5 \n" + "vpslld $0x10,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand 
%%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm5"); +} + +void ScaleRowDown4Box_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpsllw $0x3,%%ymm4,%%ymm5 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu 0x00(%0,%3,2),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,2),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu 0x00(%0,%4,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%4,1),%%ymm3 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpsrlw $0x4,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "r"((intptr_t) (src_stride * 
3)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +#endif // HAS_SCALEROWDOWN4_AVX2 + +void ScaleRowDown34_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + (void) src_stride; + asm volatile( + "movdqa %0,%%xmm3 \n" + "movdqa %1,%%xmm4 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kShuf0), // %0 + "m"(kShuf1), // %1 + "m"(kShuf2) // %2 + ); + asm volatile(LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm2 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "palignr $0x8,%%xmm0,%%xmm1 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x8(%1) \n" + "movq %%xmm2,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +void ScaleRowDown34_1_Box_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile( + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); + asm volatile(LABELALIGN + "1: \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x00(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,(%1) \n" + "movdqu 0x8(%0),%%xmm6 \n" + "movdqu 0x8(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + 
"movq %%xmm6,0x8(%1) \n" + "movdqu 0x10(%0),%%xmm6 \n" + "movdqu 0x10(%0,%3,1),%%xmm7 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); +} + +void ScaleRowDown34_0_Box_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile( + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); + + asm volatile(LABELALIGN + "1: \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x00(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,(%1) \n" + "movdqu 0x8(%0),%%xmm6 \n" + "movdqu 0x8(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x8(%1) \n" + "movdqu 0x10(%0),%%xmm6 \n" + "movdqu 0x10(%0,%3,1),%%xmm7 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq 
%%xmm6,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); +} + +void ScaleRowDown38_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + (void) src_stride; + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movhlps %%xmm0,%%xmm1 \n" + "movd %%xmm1,0x8(%1) \n" + "lea 0xc(%1),%1 \n" + "sub $0xc,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kShuf38a), // %3 + "m"(kShuf38b) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"); +} + +void ScaleRowDown38_2_Box_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "movdqa %3,%%xmm5 \n" + : + : "m"(kShufAb0), // %0 + "m"(kShufAb1), // %1 + "m"(kShufAb2), // %2 + "m"(kScaleAb2) // %3 + ); + asm volatile(LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%3,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "paddusw %%xmm6,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "paddusw %%xmm0,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movd %%xmm1,(%1) \n" + "psrlq $0x10,%%xmm1 \n" + "movd %%xmm1,0x2(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)) // %3 + : "memory", 
"cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6"); +} + +void ScaleRowDown38_3_Box_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + : + : "m"(kShufAc), // %0 + "m"(kShufAc3), // %1 + "m"(kScaleAc33) // %2 + ); + asm volatile(LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%3,1),%%xmm6 \n" + "movhlps %%xmm0,%%xmm1 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqu 0x00(%0,%3,2),%%xmm6 \n" + "lea 0x10(%0),%0 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "movdqa %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "pshufb %%xmm3,%%xmm7 \n" + "paddusw %%xmm7,%%xmm6 \n" + "pmulhuw %%xmm4,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movd %%xmm6,(%1) \n" + "psrlq $0x10,%%xmm6 \n" + "movd %%xmm6,0x2(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); +} + +static const uvec8 kLinearShuffleFar = {2, 3, 0, 1, 6, 7, 4, 5, + 10, 11, 8, 9, 14, 15, 12, 13}; + +static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3, + 3, 1, 1, 3, 3, 1, 1, 3}; + +#ifdef HAS_SCALEROWUP2_LINEAR_SSE2 + +void ScaleRowUp2_Linear_SSE2(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int 
dst_width) { + asm volatile( + "pxor %%xmm0,%%xmm0 \n" // 0 + "pcmpeqw %%xmm6,%%xmm6 \n" + "psrlw $15,%%xmm6 \n" + "psllw $1,%%xmm6 \n" // all 2 + + LABELALIGN + "1: \n" + "movq (%0),%%xmm1 \n" // 01234567 + "movq 1(%0),%%xmm2 \n" // 12345678 + "movdqa %%xmm1,%%xmm3 \n" + "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 + "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677 + "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 + "movdqa %%xmm1,%%xmm4 \n" + "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16) + "movdqa %%xmm2,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16) + "paddw %%xmm5,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "paddw %%xmm6,%%xmm4 \n" + "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16) + "paddw %%xmm5,%%xmm5 \n" + "paddw %%xmm4,%%xmm5 \n" // 3*near+far+2 (lo) + "psrlw $2,%%xmm5 \n" // 3/4*near+1/4*far (lo) + + "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16) + "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) + "paddw %%xmm2,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) + "paddw %%xmm6,%%xmm1 \n" + "paddw %%xmm3,%%xmm3 \n" + "paddw %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) + "psrlw $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) + + "packuswb %%xmm1,%%xmm5 \n" + "movdqu %%xmm5,(%1) \n" + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2 + +void ScaleRowUp2_Bilinear_SSE2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + LABELALIGN + "1: \n" + "pxor %%xmm0,%%xmm0 \n" // 0 + // above line + "movq (%0),%%xmm1 \n" // 01234567 + "movq 1(%0),%%xmm2 \n" // 12345678 + "movdqa %%xmm1,%%xmm3 \n" + "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 + "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677 + "punpcklbw %%xmm2,%%xmm2 \n" // 
1122334455667788 + + "movdqa %%xmm1,%%xmm4 \n" + "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16) + "movdqa %%xmm2,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16) + "paddw %%xmm5,%%xmm4 \n" // near+far + "movdqa %%xmm3,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16) + "paddw %%xmm5,%%xmm5 \n" // 2*near + "paddw %%xmm5,%%xmm4 \n" // 3*near+far (1, lo) + + "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16) + "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) + "paddw %%xmm2,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) + "paddw %%xmm3,%%xmm3 \n" // 2*near + "paddw %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) + + // below line + "movq (%0,%3),%%xmm6 \n" // 01234567 + "movq 1(%0,%3),%%xmm2 \n" // 12345678 + "movdqa %%xmm6,%%xmm3 \n" + "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 + "punpcklbw %%xmm6,%%xmm6 \n" // 0011223344556677 + "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 + + "movdqa %%xmm6,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" // 00112233 (16) + "movdqa %%xmm2,%%xmm7 \n" + "punpcklbw %%xmm0,%%xmm7 \n" // 11223344 (16) + "paddw %%xmm7,%%xmm5 \n" // near+far + "movdqa %%xmm3,%%xmm7 \n" + "punpcklbw %%xmm0,%%xmm7 \n" // 01122334 (16) + "paddw %%xmm7,%%xmm7 \n" // 2*near + "paddw %%xmm7,%%xmm5 \n" // 3*near+far (2, lo) + + "punpckhbw %%xmm0,%%xmm6 \n" // 44556677 (16) + "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) + "paddw %%xmm6,%%xmm2 \n" // near+far + "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) + "paddw %%xmm3,%%xmm3 \n" // 2*near + "paddw %%xmm3,%%xmm2 \n" // 3*near+far (2, hi) + + // xmm4 xmm1 + // xmm5 xmm2 + "pcmpeqw %%xmm0,%%xmm0 \n" + "psrlw $15,%%xmm0 \n" + "psllw $3,%%xmm0 \n" // all 8 + + "movdqa %%xmm4,%%xmm3 \n" + "movdqa %%xmm5,%%xmm6 \n" + "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (1, lo) + "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, lo) + "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (1, lo) + "paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, lo) + "psrlw $4,%%xmm3 \n" // ^ div by 16 + + "movdqa %%xmm1,%%xmm7 \n" + "movdqa 
%%xmm2,%%xmm6 \n" + "paddw %%xmm7,%%xmm7 \n" // 6*near+2*far (1, hi) + "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, hi) + "paddw %%xmm1,%%xmm7 \n" // 9*near+3*far (1, hi) + "paddw %%xmm6,%%xmm7 \n" // 9 3 3 1 + 8 (1, hi) + "psrlw $4,%%xmm7 \n" // ^ div by 16 + + "packuswb %%xmm7,%%xmm3 \n" + "movdqu %%xmm3,(%1) \n" // save above line + + "movdqa %%xmm5,%%xmm3 \n" + "paddw %%xmm0,%%xmm4 \n" // 3*near+far+8 (1, lo) + "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, lo) + "paddw %%xmm3,%%xmm5 \n" // 9*near+3*far (2, lo) + "paddw %%xmm4,%%xmm5 \n" // 9 3 3 1 + 8 (lo) + "psrlw $4,%%xmm5 \n" // ^ div by 16 + + "movdqa %%xmm2,%%xmm3 \n" + "paddw %%xmm0,%%xmm1 \n" // 3*near+far+8 (1, hi) + "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, hi) + "paddw %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) + "paddw %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (hi) + "psrlw $4,%%xmm2 \n" // ^ div by 16 + + "packuswb %%xmm2,%%xmm5 \n" + "movdqu %%xmm5,(%1,%4) \n" // save below line + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "r"((intptr_t) (dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3 + +void ScaleRowUp2_Linear_12_SSSE3(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width) { + asm volatile( + "movdqa %3,%%xmm5 \n" + "pcmpeqw %%xmm4,%%xmm4 \n" + "psrlw $15,%%xmm4 \n" + "psllw $1,%%xmm4 \n" // all 2 + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" // 01234567 (16) + "movdqu 2(%0),%%xmm1 \n" // 12345678 (16) + + "movdqa %%xmm0,%%xmm2 \n" + "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16) + "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16) + + "movdqa %%xmm2,%%xmm3 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm5,%%xmm3 \n" // 54657687 (far) + "pshufb %%xmm5,%%xmm1 \n" // 10213243 (far) + + "paddw %%xmm4,%%xmm1 \n" // far+2 
+ "paddw %%xmm4,%%xmm3 \n" // far+2 + "paddw %%xmm0,%%xmm1 \n" // near+far+2 + "paddw %%xmm2,%%xmm3 \n" // near+far+2 + "paddw %%xmm0,%%xmm0 \n" // 2*near + "paddw %%xmm2,%%xmm2 \n" // 2*near + "paddw %%xmm1,%%xmm0 \n" // 3*near+far+2 (lo) + "paddw %%xmm3,%%xmm2 \n" // 3*near+far+2 (hi) + + "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far + "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm2,16(%1) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kLinearShuffleFar) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3 + +void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "pcmpeqw %%xmm7,%%xmm7 \n" + "psrlw $15,%%xmm7 \n" + "psllw $3,%%xmm7 \n" // all 8 + "movdqa %5,%%xmm6 \n" + + LABELALIGN + "1: \n" + // above line + "movdqu (%0),%%xmm0 \n" // 01234567 (16) + "movdqu 2(%0),%%xmm1 \n" // 12345678 (16) + "movdqa %%xmm0,%%xmm2 \n" + "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16) + "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16) + "movdqa %%xmm2,%%xmm3 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm6,%%xmm3 \n" // 54657687 (far) + "pshufb %%xmm6,%%xmm1 \n" // 10213243 (far) + "paddw %%xmm0,%%xmm1 \n" // near+far + "paddw %%xmm2,%%xmm3 \n" // near+far + "paddw %%xmm0,%%xmm0 \n" // 2*near + "paddw %%xmm2,%%xmm2 \n" // 2*near + "paddw %%xmm1,%%xmm0 \n" // 3*near+far (1, lo) + "paddw %%xmm3,%%xmm2 \n" // 3*near+far (1, hi) + + // below line + "movdqu (%0,%3,2),%%xmm1 \n" // 01234567 (16) + "movdqu 2(%0,%3,2),%%xmm4 \n" // 12345678 (16) + "movdqa %%xmm1,%%xmm3 \n" + "punpckhwd %%xmm4,%%xmm3 \n" // 45566778 (16) + "punpcklwd %%xmm4,%%xmm1 \n" // 01122334 (16) + "movdqa %%xmm3,%%xmm5 \n" + "movdqa %%xmm1,%%xmm4 \n" + "pshufb 
%%xmm6,%%xmm5 \n" // 54657687 (far) + "pshufb %%xmm6,%%xmm4 \n" // 10213243 (far) + "paddw %%xmm1,%%xmm4 \n" // near+far + "paddw %%xmm3,%%xmm5 \n" // near+far + "paddw %%xmm1,%%xmm1 \n" // 2*near + "paddw %%xmm3,%%xmm3 \n" // 2*near + "paddw %%xmm4,%%xmm1 \n" // 3*near+far (2, lo) + "paddw %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) + + // xmm0 xmm2 + // xmm1 xmm3 + + "movdqa %%xmm0,%%xmm4 \n" + "movdqa %%xmm1,%%xmm5 \n" + "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, lo) + "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, lo) + "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) + "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) + "psrlw $4,%%xmm4 \n" // ^ div by 16 + "movdqu %%xmm4,(%1) \n" + + "movdqa %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, hi) + "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, hi) + "paddw %%xmm2,%%xmm4 \n" // 9*near+3*far (1, hi) + "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, hi) + "psrlw $4,%%xmm4 \n" // ^ div by 16 + "movdqu %%xmm4,0x10(%1) \n" + + "movdqa %%xmm1,%%xmm4 \n" + "paddw %%xmm7,%%xmm0 \n" // 3*near+far+8 (1, lo) + "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, lo) + "paddw %%xmm4,%%xmm1 \n" // 9*near+3*far (2, lo) + "paddw %%xmm0,%%xmm1 \n" // 9 3 3 1 + 8 (2, lo) + "psrlw $4,%%xmm1 \n" // ^ div by 16 + "movdqu %%xmm1,(%1,%4,2) \n" + + "movdqa %%xmm3,%%xmm4 \n" + "paddw %%xmm7,%%xmm2 \n" // 3*near+far+8 (1, hi) + "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, hi) + "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (2, hi) + "paddw %%xmm2,%%xmm3 \n" // 9 3 3 1 + 8 (2, hi) + "psrlw $4,%%xmm3 \n" // ^ div by 16 + "movdqu %%xmm3,0x10(%1,%4,2) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "r"((intptr_t) (dst_stride)), // %4 + "m"(kLinearShuffleFar) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); 
+} + +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2 + +void ScaleRowUp2_Linear_16_SSE2(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width) { + asm volatile( + "pxor %%xmm5,%%xmm5 \n" + "pcmpeqd %%xmm4,%%xmm4 \n" + "psrld $31,%%xmm4 \n" + "pslld $1,%%xmm4 \n" // all 2 + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 0123 (16b) + "movq 2(%0),%%xmm1 \n" // 1234 (16b) + + "punpcklwd %%xmm5,%%xmm0 \n" // 0123 (32b) + "punpcklwd %%xmm5,%%xmm1 \n" // 1234 (32b) + + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + + "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far) + "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far) + + "paddd %%xmm4,%%xmm2 \n" // far+2 (lo) + "paddd %%xmm4,%%xmm3 \n" // far+2 (hi) + "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo) + "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi) + "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) + "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) + "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo) + "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) + + "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) + "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) + "packssdw %%xmm1,%%xmm0 \n" + "pshufd $0b11011000,%%xmm0,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2 + +void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "pxor %%xmm7,%%xmm7 \n" + "pcmpeqd %%xmm6,%%xmm6 \n" + "psrld $31,%%xmm6 \n" + "pslld $3,%%xmm6 \n" // all 8 + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) + "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) + "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v) + "punpcklwd %%xmm7,%%xmm1 \n" // 1122 
(near) (32b, 1u1v) + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo) + "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi) + "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo) + "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi) + "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo) + "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi) + "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) + "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) + + "movq (%0),%%xmm0 \n" // 0123 (16b) + "movq 2(%0),%%xmm1 \n" // 1234 (16b) + "punpcklwd %%xmm7,%%xmm0 \n" // 0123 (32b) + "punpcklwd %%xmm7,%%xmm1 \n" // 1234 (32b) + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far) + "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far) + "paddd %%xmm0,%%xmm2 \n" // near+far (lo) + "paddd %%xmm1,%%xmm3 \n" // near+far (hi) + "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) + "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) + "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) + "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) + + "movq (%0,%3,2),%%xmm2 \n" + "movq 2(%0,%3,2),%%xmm3 \n" + "punpcklwd %%xmm7,%%xmm2 \n" // 0123 (32b) + "punpcklwd %%xmm7,%%xmm3 \n" // 1234 (32b) + "movdqa %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "pshufd $0b10110001,%%xmm4,%%xmm4 \n" // 1032 (even, far) + "pshufd $0b10110001,%%xmm5,%%xmm5 \n" // 2143 (odd, far) + "paddd %%xmm2,%%xmm4 \n" // near+far (lo) + "paddd %%xmm3,%%xmm5 \n" // near+far (hi) + "paddd %%xmm2,%%xmm2 \n" // 2*near (lo) + "paddd %%xmm3,%%xmm3 \n" // 2*near (hi) + "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo) + "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) + + "movdqa %%xmm0,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) + "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) + "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) + "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) + "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo) + 
+ "movdqa %%xmm2,%%xmm5 \n" + "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo) + "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) + "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo) + "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) + "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo) + + "movdqa %%xmm1,%%xmm0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi) + "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi) + "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi) + "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) + "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi) + + "movdqa %%xmm3,%%xmm2 \n" + "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi) + "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi) + "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) + "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi) + "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi) + + "packssdw %%xmm0,%%xmm4 \n" + "pshufd $0b11011000,%%xmm4,%%xmm4 \n" + "movdqu %%xmm4,(%1) \n" // store above + "packssdw %%xmm2,%%xmm5 \n" + "pshufd $0b11011000,%%xmm5,%%xmm5 \n" + "movdqu %%xmm5,(%1,%4,2) \n" // store below + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "r"((intptr_t) (dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3 + +void ScaleRowUp2_Linear_SSSE3(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width) { + asm volatile( + "pcmpeqw %%xmm4,%%xmm4 \n" + "psrlw $15,%%xmm4 \n" + "psllw $1,%%xmm4 \n" // all 2 + "movdqa %3,%%xmm3 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 01234567 + "movq 1(%0),%%xmm1 \n" // 12345678 + "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767 + "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878 + "movdqa %%xmm0,%%xmm2 \n" + "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878 + "punpckldq 
%%xmm1,%%xmm0 \n" // 0101121223233434 + "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (hi) + "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (lo) + "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo) + "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi) + "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) + "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi) + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kLinearMadd31) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} + +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3 + +void ScaleRowUp2_Bilinear_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "pcmpeqw %%xmm6,%%xmm6 \n" + "psrlw $15,%%xmm6 \n" + "psllw $3,%%xmm6 \n" // all 8 + "movdqa %5,%%xmm7 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 01234567 + "movq 1(%0),%%xmm1 \n" // 12345678 + "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767 + "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878 + "movdqa %%xmm0,%%xmm2 \n" + "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878 + "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434 + "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1, hi) + "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1, lo) + + "movq (%0,%3),%%xmm1 \n" + "movq 1(%0,%3),%%xmm4 \n" + "punpcklwd %%xmm1,%%xmm1 \n" + "punpcklwd %%xmm4,%%xmm4 \n" + "movdqa %%xmm1,%%xmm3 \n" + "punpckhdq %%xmm4,%%xmm3 \n" + "punpckldq %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi) + "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo) + + // xmm0 xmm2 + // xmm1 xmm3 + + "movdqa %%xmm0,%%xmm4 \n" + "movdqa %%xmm1,%%xmm5 \n" + "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) + "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) + "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) + "paddw %%xmm5,%%xmm4 \n" // 
9 3 3 1 + 8 (1, lo) + "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo) + + "movdqa %%xmm1,%%xmm5 \n" + "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo) + "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) + "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo) + "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) + "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo) + + "movdqa %%xmm2,%%xmm0 \n" + "movdqa %%xmm3,%%xmm1 \n" + "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi) + "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi) + "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi) + "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) + "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi) + + "movdqa %%xmm3,%%xmm1 \n" + "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi) + "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi) + "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi) + "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi) + "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi) + + "packuswb %%xmm0,%%xmm4 \n" + "movdqu %%xmm4,(%1) \n" // store above + "packuswb %%xmm1,%%xmm5 \n" + "movdqu %%xmm5,(%1,%4) \n" // store below + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "r"((intptr_t) (dst_stride)), // %4 + "m"(kLinearMadd31) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_AVX2 + +void ScaleRowUp2_Linear_AVX2(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $15,%%ymm4,%%ymm4 \n" + "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 + "vbroadcastf128 %3,%%ymm3 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF + "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0 + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" + 
"vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n" + "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" + "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi) + "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo) + "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo) + "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi) + "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) + "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 16 sample to 32 sample + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kLinearMadd31) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} + +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_AVX2 + +void ScaleRowUp2_Bilinear_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" + "vpsrlw $15,%%ymm6,%%ymm6 \n" + "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 + "vbroadcastf128 %5,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF + "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0 + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" + "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n" + "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" + "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi) + "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo) + + "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF + "vmovdqu 1(%0,%3),%%xmm3 \n" // 123456789ABCDEF0 + "vpermq $0b11011000,%%ymm2,%%ymm2 \n" + "vpermq $0b11011000,%%ymm3,%%ymm3 \n" + "vpunpcklwd %%ymm2,%%ymm2,%%ymm2 \n" + "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" + "vpunpckhdq %%ymm3,%%ymm2,%%ymm4 \n" + "vpunpckldq %%ymm3,%%ymm2,%%ymm2 \n" + "vpmaddubsw 
%%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi) + "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo) + + // ymm0 ymm1 + // ymm2 ymm3 + + "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) + "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) + "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) + "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) + "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) + + "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) + "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) + "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) + "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) + "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) + + "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) + "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) + "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) + "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) + "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) + + "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) + "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) + "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) + "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) + "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) + + "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n" + "vmovdqu %%ymm4,(%1) \n" // store above + "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n" + "vmovdqu %%ymm5,(%1,%4) \n" // store below + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 16 sample to 32 sample + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "r"((intptr_t) (dst_stride)), // %4 + "m"(kLinearMadd31) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_12_AVX2 + +void ScaleRowUp2_Linear_12_AVX2(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int 
dst_width) { + asm volatile( + "vbroadcastf128 %3,%%ymm5 \n" + "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $15,%%ymm4,%%ymm4 \n" + "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // 0123456789ABCDEF (16b) + "vmovdqu 2(%0),%%ymm1 \n" // 123456789ABCDEF0 (16b) + + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 012389AB4567CDEF + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 12349ABC5678DEF0 + + "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n" // 899AABBCCDDEEFF0 (near) + "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near) + "vpshufb %%ymm5,%%ymm2,%%ymm3 \n" // 98A9BACBDCEDFE0F (far) + "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far) + + "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // far+2 + "vpaddw %%ymm4,%%ymm3,%%ymm3 \n" // far+2 + "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far+2 + "vpaddw %%ymm2,%%ymm3,%%ymm3 \n" // near+far+2 + "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near + "vpaddw %%ymm2,%%ymm2,%%ymm2 \n" // 2*near + "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 3*near+far+2 + "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 3*near+far+2 + + "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far + "vpsrlw $2,%%ymm2,%%ymm2 \n" // 3/4*near+1/4*far + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm2,32(%1) \n" + + "lea 0x20(%0),%0 \n" + "lea 0x40(%1),%1 \n" // 16 sample to 32 sample + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kLinearShuffleFar) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_12_AVX2 + +void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "vbroadcastf128 %5,%%ymm5 \n" + "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $15,%%ymm4,%%ymm4 \n" + "vpsllw $3,%%ymm4,%%ymm4 \n" // all 8 + + LABELALIGN + "1: \n" + + "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b) + "vmovdqu 2(%0),%%xmm1 \n" // 12345678 
(16b) + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000 + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000 + "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near) + "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far) + "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far + "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near + "vpaddw %%ymm0,%%ymm1,%%ymm2 \n" // 3*near+far (1) + + "vmovdqu (%0,%3,2),%%xmm0 \n" // 01234567 (16b) + "vmovdqu 2(%0,%3,2),%%xmm1 \n" // 12345678 (16b) + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000 + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000 + "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near) + "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far) + "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far + "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near + "vpaddw %%ymm0,%%ymm1,%%ymm3 \n" // 3*near+far (2) + + "vpaddw %%ymm2,%%ymm2,%%ymm0 \n" // 6*near+2*far (1) + "vpaddw %%ymm4,%%ymm3,%%ymm1 \n" // 3*near+far+8 (2) + "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9*near+3*far (1) + "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (1) + "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 + "vmovdqu %%ymm0,(%1) \n" // store above + + "vpaddw %%ymm3,%%ymm3,%%ymm0 \n" // 6*near+2*far (2) + "vpaddw %%ymm4,%%ymm2,%%ymm1 \n" // 3*near+far+8 (1) + "vpaddw %%ymm0,%%ymm3,%%ymm0 \n" // 9*near+3*far (2) + "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (2) + "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 + "vmovdqu %%ymm0,(%1,%4,2) \n" // store below + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "r"((intptr_t) (dst_stride)), // %4 + "m"(kLinearShuffleFar) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_16_AVX2 + +void ScaleRowUp2_Linear_16_AVX2(const uint16_t 
*src_ptr, + uint16_t *dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrld $31,%%ymm4,%%ymm4 \n" + "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v) + "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v) + + "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) + + "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far) + "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far) + + "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo) + "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi) + "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) + "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi) + + "vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) + "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) + "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n" + "vpshufd $0b11011000,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} + +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_16_AVX2 + +void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" + "vpsrld $31,%%ymm6,%%ymm6 \n" + "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 + + LABELALIGN + "1: \n" + + "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v) + "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v) + "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm1,%%ymm1 \n" // 
12345678 (32b, 1u1v) + "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far) + "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far) + "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi) + "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) + "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (1, lo) + "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (1, hi) + + "vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b, 1u1v) + "vmovdqu 2(%0,%3,2),%%xmm3 \n" // 12345678 (16b, 1u1v) + "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v) + "vpshufd $0b10110001,%%ymm2,%%ymm4 \n" // 10325476 (lo, far) + "vpshufd $0b10110001,%%ymm3,%%ymm5 \n" // 21436587 (hi, far) + "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo) + "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi) + "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo) + "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi) + "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (2, lo) + "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (2, hi) + + "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) + "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) + "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) + "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) + "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) + + "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) + "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) + "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) + "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) + "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) + + "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) + "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) + "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) + "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 
(1, hi) + + "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) + "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) + "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) + "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) + "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) + + "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n" + "vpshufd $0b11011000,%%ymm4,%%ymm4 \n" + "vmovdqu %%ymm4,(%1) \n" // store above + "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n" + "vpshufd $0b11011000,%%ymm5,%%ymm5 \n" + "vmovdqu %%ymm5,(%1,%4,2) \n" // store below + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "r"((intptr_t) (dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +#endif + +// Reads 16xN bytes and produces 16 shorts at a time. +void ScaleAddRow_SSE2(const uint8_t *src_ptr, + uint16_t *dst_ptr, + int src_width) { + asm volatile("pxor %%xmm5,%%xmm5 \n" + + // 16 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm3 \n" + "lea 0x10(%0),%0 \n" // src_ptr += 16 + "movdqu (%1),%%xmm0 \n" + "movdqu 0x10(%1),%%xmm1 \n" + "movdqa %%xmm3,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpckhbw %%xmm5,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} + +#ifdef HAS_SCALEADDROW_AVX2 + +// Reads 32 bytes and accumulates to 32 shorts at a time. 
+void ScaleAddRow_AVX2(const uint8_t *src_ptr, + uint16_t *dst_ptr, + int src_width) { + asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm3 \n" + "lea 0x20(%0),%0 \n" // src_ptr += 32 + "vpermq $0xd8,%%ymm3,%%ymm3 \n" // qwords to 0,2,1,3 so the in-lane unpacks stay in pixel order + "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" // pixels 0-15 zero-extended to 16 bits + "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" // pixels 16-31 zero-extended to 16 bits + "vpaddusw (%1),%%ymm2,%%ymm0 \n" // saturating add into dst shorts 0-15 + "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n" // saturating add into dst shorts 16-31 + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" // dst_ptr += 32 shorts + "sub $0x20,%2 \n" // 32 source bytes consumed per pass + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} + +#endif // HAS_SCALEADDROW_AVX2 + +// Constant for making pixels signed to avoid pmaddubsw +// saturation. +static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; + +// Constant for making pixels unsigned and adding .5 for rounding. +static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, + 0x4040, 0x4040, 0x4040, 0x4040}; + +// Bilinear column filtering. SSSE3 version. 
+void ScaleFilterCols_SSSE3(uint8_t *dst_ptr, + const uint8_t *src_ptr, + int dst_width, + int x, + int dx) { + intptr_t x0, x1, temp_pixel; + asm volatile( + "movd %6,%%xmm2 \n" + "movd %7,%%xmm3 \n" + "movl $0x04040000,%k2 \n" + "movd %k2,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" // 0x007f007f + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $15,%%xmm7 \n" // 0x00010001 + + "pextrw $0x1,%%xmm2,%k3 \n" // x0 = bits 16-31 of x (source index) + "subl $0x2,%5 \n" + "jl 29f \n" // fewer than 2 pixels: skip the pair loop + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" // x + dx + "punpckldq %%xmm0,%%xmm2 \n" // xmm2 = {x, x + dx} + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" // step = {2 * dx, 2 * dx} + "pextrw $0x3,%%xmm2,%k4 \n" // x1 = bits 16-31 of x + dx + + LABELALIGN + "2: \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" // advance both positions by 2 * dx + "movzwl 0x00(%1,%3,1),%k2 \n" // two adjacent source bytes at x0 + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm1 \n" // keep the top 7 fraction bits + "movzwl 0x00(%1,%4,1),%k2 \n" // two adjacent source bytes at x1 + "movd %k2,%%xmm4 \n" + "pshufb %%xmm5,%%xmm1 \n" // fractions as bytes f0,f0,f1,f1 + "punpcklwd %%xmm4,%%xmm0 \n" + "psubb %8,%%xmm0 \n" // make pixels signed. + "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) + + // 1 + "paddusb %%xmm7,%%xmm1 \n" + "pmaddubsw %%xmm0,%%xmm1 \n" // (128 - f) * left + f * right, on the biased pixels + "pextrw $0x1,%%xmm2,%k3 \n" // next x0 + "pextrw $0x3,%%xmm2,%k4 \n" // next x1 + "paddw %9,%%xmm1 \n" // make pixels unsigned. + "psrlw $0x7,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movd %%xmm1,%k2 \n" + "mov %w2,(%0) \n" // store 2 output pixels + "lea 0x2(%0),%0 \n" + "subl $0x2,%5 \n" + "jge 2b \n" + + LABELALIGN + "29: \n" + "addl $0x1,%5 \n" + "jl 99f \n" // no odd pixel left + "movzwl 0x00(%1,%3,1),%k2 \n" + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm2 \n" + "pshufb %%xmm5,%%xmm2 \n" + "psubb %8,%%xmm0 \n" // make pixels signed. + "pxor %%xmm6,%%xmm2 \n" + "paddusb %%xmm7,%%xmm2 \n" + "pmaddubsw %%xmm0,%%xmm2 \n" + "paddw %9,%%xmm2 \n" // make pixels unsigned. 
+ "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movd %%xmm2,%k2 \n" + "mov %b2,(%0) \n" + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "=&a"(temp_pixel), // %2 + "=&r"(x0), // %3 + "=&r"(x1), // %4 +#if defined(__x86_64__) + "+rm"(dst_width) // %5 +#else + "+m"(dst_width) // %5 +#endif + : "rm"(x), // %6 + "rm"(dx), // %7 +#if defined(__x86_64__) + "x"(kFsub80), // %8 + "x"(kFadd40) // %9 +#else + "m"(kFsub80), // %8 + "m"(kFadd40) // %9 +#endif + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +// Reads 16 pixels, duplicates each one and writes 32 pixels. +// No alignment requirement: every load and store below is movdqu. +void ScaleColsUp2_SSE2(uint8_t *dst_ptr, + const uint8_t *src_ptr, + int dst_width, + int x, + int dx) { + (void) x; + (void) dx; + asm volatile(LABELALIGN + "1: \n" + "movdqu (%1),%%xmm0 \n" // 16 source pixels + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" // pixels 0-7, each doubled + "punpckhbw %%xmm1,%%xmm1 \n" // pixels 8-15, each doubled + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x20,%2 \n" // 32 destination pixels per pass + "jg 1b \n" + + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); +} + +// Divide num by div and return as 16.16 fixed point result. +int FixedDiv_X86(int num, int div) { + asm volatile( + "cdq \n" // sign-extend eax into edx:eax + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" // edx:eax = num << 16 as a 64 bit value + "idiv %1 \n" // eax = edx:eax / div + "mov %0, %%eax \n" // %0 is bound to eax ("+a"), so this is a no-op + : "+a"(num) // %0 + : "c"(div) // %1 + : "memory", "cc", "edx"); + return num; +} + +// Divide num - 1 by div - 1 and return as 16.16 fixed point result. 
+int FixedDiv1_X86(int num, int div) { + asm volatile( + "cdq \n" // sign-extend eax into edx:eax + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" // edx:eax = num << 16 as a 64 bit value + "sub $0x10001,%%eax \n" + "sbb $0x0,%%edx \n" // edx:eax -= 0x10001 (borrow carried into edx) + "sub $0x1,%1 \n" // div - 1; this writes operand %1 + "idiv %1 \n" // eax = edx:eax / (div - 1) + "mov %0, %%eax \n" // %0 is bound to eax ("+a"), so this is a no-op + : "+a"(num), "+c"(div) // %0, %1: div is decremented by the asm, so it must be in/out + : // no input-only operands + : "memory", "cc", "edx"); + return num; +} + +#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) || \ + defined(HAS_SCALEUVROWDOWN2BOX_AVX2) + +// Shuffle table for splitting UV into upper and lower part of register. +static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u, + 1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u}; +static const uvec8 kShuffleMergeUV = {0u, 8u, 2u, 10u, 4u, 12u, + 6u, 14u, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}; +#endif + +#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3 + +void ScaleUVRowDown2Box_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101 + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5, %%xmm5 \n" // zero + "movdqa %4,%%xmm1 \n" // split shuffler + "movdqa %5,%%xmm3 \n" // merge shuffler + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" // 8 UV row 0 + "movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1 + "lea 0x10(%0),%0 \n" + "pshufb %%xmm1,%%xmm0 \n" // uuuuvvvv + "pshufb %%xmm1,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" // horizontal add + "pmaddubsw %%xmm4,%%xmm2 \n" + "paddw %%xmm2,%%xmm0 \n" // vertical add + "psrlw $0x1,%%xmm0 \n" // round + "pavgw %%xmm5,%%xmm0 \n" // average with zero: completes (sum + 2) >> 2 + "pshufb %%xmm3,%%xmm0 \n" // merge uv + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" // 4 UV + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "m"(kShuffleSplitUV), // %4 + "m"(kShuffleMergeUV) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +#endif // HAS_SCALEUVROWDOWN2BOX_SSSE3 + +#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2 + +void ScaleUVRowDown2Box_AVX2(const uint8_t 
*src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101 + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero + "vbroadcastf128 %4,%%ymm1 \n" // split shuffler + "vbroadcastf128 %5,%%ymm3 \n" // merge shuffler + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // 16 UV row 0 + "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" // 16 UV row 1 + "lea 0x20(%0),%0 \n" + "vpshufb %%ymm1,%%ymm0,%%ymm0 \n" // uuuuvvvv + "vpshufb %%ymm1,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // horizontal add + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" // vertical add + "vpsrlw $0x1,%%ymm0,%%ymm0 \n" // round + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm3,%%ymm0,%%ymm0 \n" // merge uv + "vpermq $0xd8,%%ymm0,%%ymm0 \n" // combine qwords + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" // 8 UV + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "m"(kShuffleSplitUV), // %4 + "m"(kShuffleMergeUV) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +#endif // HAS_SCALEUVROWDOWN2BOX_AVX2 + +static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3, + 3, 1, 3, 1, 1, 3, 1, 3}; + +#ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3 + +void ScaleUVRowUp2_Linear_SSSE3(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width) { + asm volatile( + "pcmpeqw %%xmm4,%%xmm4 \n" + "psrlw $15,%%xmm4 \n" + "psllw $1,%%xmm4 \n" // all 2 + "movdqa %3,%%xmm3 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 00112233 (1u1v) + "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v) + "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v) + "movdqa %%xmm0,%%xmm2 \n" + "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v) + "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v) + "pmaddubsw %%xmm3,%%xmm2 \n" // 
3*near+far (1u1v16, hi) + "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (1u1v16, lo) + "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo) + "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi) + "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) + "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi) + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 4 uv to 8 uv + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kUVLinearMadd31) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3 + +void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "pcmpeqw %%xmm6,%%xmm6 \n" + "psrlw $15,%%xmm6 \n" + "psllw $3,%%xmm6 \n" // all 8 + "movdqa %5,%%xmm7 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 00112233 (1u1v) + "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v) + "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v) + "movdqa %%xmm0,%%xmm2 \n" + "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v) + "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v) + "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1u1v16, hi) + "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1u1v16, lo) + + "movq (%0,%3),%%xmm1 \n" + "movq 2(%0,%3),%%xmm4 \n" + "punpcklbw %%xmm4,%%xmm1 \n" + "movdqa %%xmm1,%%xmm3 \n" + "punpckhdq %%xmm1,%%xmm3 \n" + "punpckldq %%xmm1,%%xmm1 \n" + "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi) + "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo) + + // xmm0 xmm2 + // xmm1 xmm3 + + "movdqa %%xmm0,%%xmm4 \n" + "movdqa %%xmm1,%%xmm5 \n" + "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) + "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) + "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) + "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) + "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo) + + "movdqa 
%%xmm1,%%xmm5 \n" + "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo) + "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) + "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo) + "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) + "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo) + + "movdqa %%xmm2,%%xmm0 \n" + "movdqa %%xmm3,%%xmm1 \n" + "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi) + "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi) + "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi) + "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) + "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi) + + "movdqa %%xmm3,%%xmm1 \n" + "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi) + "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi) + "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi) + "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi) + "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi) + + "packuswb %%xmm0,%%xmm4 \n" + "movdqu %%xmm4,(%1) \n" // store above + "packuswb %%xmm1,%%xmm5 \n" + "movdqu %%xmm5,(%1,%4) \n" // store below + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 4 uv to 8 uv + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "r"((intptr_t) (dst_stride)), // %4 + "m"(kUVLinearMadd31) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2 + +void ScaleUVRowUp2_Linear_AVX2(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $15,%%ymm4,%%ymm4 \n" + "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 + "vbroadcastf128 %3,%%ymm3 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" + "vmovdqu 2(%0),%%xmm1 \n" + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" + "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n" + "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far 
(hi) + "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo) + "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo) + "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi) + "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) + "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 uv to 16 uv + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kUVLinearMadd31) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} + +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_AVX2 + +void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" + "vpsrlw $15,%%ymm6,%%ymm6 \n" + "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 + "vbroadcastf128 %5,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" + "vmovdqu 2(%0),%%xmm1 \n" + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" + "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n" + "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi) + "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo) + + "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF + "vmovdqu 2(%0,%3),%%xmm3 \n" // 123456789ABCDEF0 + "vpermq $0b11011000,%%ymm2,%%ymm2 \n" + "vpermq $0b11011000,%%ymm3,%%ymm3 \n" + "vpunpcklbw %%ymm3,%%ymm2,%%ymm2 \n" + "vpunpckhdq %%ymm2,%%ymm2,%%ymm4 \n" + "vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi) + "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo) + + // ymm0 ymm1 + // ymm2 ymm3 + + "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) + "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) + "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" 
// 9*near+3*far (1, lo) + "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) + "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) + + "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) + "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) + "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) + "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) + "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) + + "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) + "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) + "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) + "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) + "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) + + "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) + "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) + "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) + "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) + "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) + + "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n" + "vmovdqu %%ymm4,(%1) \n" // store above + "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n" + "vmovdqu %%ymm5,(%1,%4) \n" // store below + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 uv to 16 uv + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "r"((intptr_t) (dst_stride)), // %4 + "m"(kUVLinearMadd31) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 + +void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width) { + asm volatile( + "pxor %%xmm5,%%xmm5 \n" + "pcmpeqd %%xmm4,%%xmm4 \n" + "psrld $31,%%xmm4 \n" + "pslld $1,%%xmm4 \n" // all 2 + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) + "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) + + "punpcklwd %%xmm5,%%xmm0 \n" // 0011 
(32b, 1u1v) + "punpcklwd %%xmm5,%%xmm1 \n" // 1122 (32b, 1u1v) + + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + + "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (lo, far) + "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (hi, far) + + "paddd %%xmm4,%%xmm2 \n" // far+2 (lo) + "paddd %%xmm4,%%xmm3 \n" // far+2 (hi) + "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo) + "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi) + "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) + "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) + "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo) + "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) + + "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) + "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) + "packusdw %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 2 uv to 4 uv + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 + +void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "pxor %%xmm7,%%xmm7 \n" + "pcmpeqd %%xmm6,%%xmm6 \n" + "psrld $31,%%xmm6 \n" + "pslld $3,%%xmm6 \n" // all 8 + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) + "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) + "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v) + "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v) + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo) + "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi) + "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo) + "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi) + "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo) + "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi) + "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) + "paddd %%xmm3,%%xmm1 \n" 
// 3*near+far (1, hi) + + "movq (%0,%3,2),%%xmm2 \n" + "movq 4(%0,%3,2),%%xmm3 \n" + "punpcklwd %%xmm7,%%xmm2 \n" + "punpcklwd %%xmm7,%%xmm3 \n" + "movdqa %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "pshufd $0b01001110,%%xmm4,%%xmm4 \n" // 1100 (far) (2, lo) + "pshufd $0b01001110,%%xmm5,%%xmm5 \n" // 2211 (far) (2, hi) + "paddd %%xmm2,%%xmm4 \n" // near+far (2, lo) + "paddd %%xmm3,%%xmm5 \n" // near+far (2, hi) + "paddd %%xmm2,%%xmm2 \n" // 2*near (2, lo) + "paddd %%xmm3,%%xmm3 \n" // 2*near (2, hi) + "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo) + "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) + + "movdqa %%xmm0,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) + "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) + "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) + "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) + "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo) + + "movdqa %%xmm2,%%xmm5 \n" + "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo) + "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) + "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo) + "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) + "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo) + + "movdqa %%xmm1,%%xmm0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi) + "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi) + "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi) + "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) + "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi) + + "movdqa %%xmm3,%%xmm2 \n" + "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi) + "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi) + "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) + "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi) + "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi) + + "packusdw %%xmm0,%%xmm4 \n" + "movdqu %%xmm4,(%1) \n" // store above + "packusdw %%xmm2,%%xmm5 \n" + "movdqu %%xmm5,(%1,%4,2) \n" // store below + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 2 uv to 4 uv + "sub 
$0x4,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "r"((intptr_t) (dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2 + +void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrld $31,%%ymm4,%%ymm4 \n" + "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v) + "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v) + + "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) + + "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far) + "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far) + + "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo) + "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi) + "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) + "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi) + + "vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) + "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) + "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 4 uv to 8 uv + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} + +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2 + +void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "vpcmpeqd 
%%ymm6,%%ymm6,%%ymm6 \n" + "vpsrld $31,%%ymm6,%%ymm6 \n" + "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 + + LABELALIGN + "1: \n" + + "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v) + "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v) + "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) + "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far) + "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far) + "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi) + "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) + "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (hi) + + "vmovdqu (%0,%3,2),%%xmm2 \n" // 00112233 (16b, 1u1v) + "vmovdqu 4(%0,%3,2),%%xmm3 \n" // 11223344 (16b, 1u1v) + "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v) + "vpshufd $0b01001110,%%ymm2,%%ymm4 \n" // 11003322 (lo, far) + "vpshufd $0b01001110,%%ymm3,%%ymm5 \n" // 22114433 (hi, far) + "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo) + "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi) + "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo) + "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi) + "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (lo) + "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (hi) + + "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) + "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) + "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) + "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) + "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) + + "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) + "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) + "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) + "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) + "vpsrld $4,%%ymm5,%%ymm5 \n" // 
^ div by 16 (2, lo) + + "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) + "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) + "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) + "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) + + "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) + "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) + "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) + "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) + "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) + + "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n" + "vmovdqu %%ymm4,(%1) \n" // store above + "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n" + "vmovdqu %%ymm5,(%1,%4,2) \n" // store below + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 4 uv to 8 uv + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "r"((intptr_t) (dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +#endif + +#endif // defined(__x86_64__) || defined(__i386__) diff --git a/pkg/encoder/yuv/libyuv/scale_row.h b/pkg/encoder/yuv/libyuv/scale_row.h new file mode 100644 index 000000000..16389cdcf --- /dev/null +++ b/pkg/encoder/yuv/libyuv/scale_row.h @@ -0,0 +1,768 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_ +#define INCLUDE_LIBYUV_SCALE_ROW_H_ + +#include "basic_types.h" +#include "scale.h" + +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__native_client__) && defined(__x86_64__)) || \ + (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) +#define LIBYUV_DISABLE_X86 +#endif +#if defined(__native_client__) +#define LIBYUV_DISABLE_NEON +#endif +// MemorySanitizer does not support assembly code yet. http://crbug.com/344505 +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON) +#define LIBYUV_DISABLE_NEON +#endif +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86) +#define LIBYUV_DISABLE_X86 +#endif +#endif +// GCC >= 4.7.0 required for AVX2. +#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) +#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) +#define GCC_HAS_AVX2 1 +#endif // GNUC >= 4.7 +#endif // __GNUC__ + +// The following are available on all x86 platforms: +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +#define HAS_FIXEDDIV1_X86 +#define HAS_FIXEDDIV_X86 +#define HAS_SCALEADDROW_SSE2 +#define HAS_SCALECOLSUP2_SSE2 +#define HAS_SCALEFILTERCOLS_SSSE3 +#define HAS_SCALEROWDOWN2_SSSE3 +#define HAS_SCALEROWDOWN34_SSSE3 +#define HAS_SCALEROWDOWN38_SSSE3 +#define HAS_SCALEROWDOWN4_SSSE3 +#endif + +// The following are available for gcc/clang x86 platforms: +// TODO(fbarchard): Port to Visual C +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) +#define HAS_SCALEUVROWDOWN2BOX_SSSE3 +#define HAS_SCALEROWUP2_LINEAR_SSE2 +#define HAS_SCALEROWUP2_LINEAR_SSSE3 +#define HAS_SCALEROWUP2_BILINEAR_SSE2 +#define HAS_SCALEROWUP2_BILINEAR_SSSE3 +#define HAS_SCALEROWUP2_LINEAR_12_SSSE3 +#define HAS_SCALEROWUP2_BILINEAR_12_SSSE3 +#define HAS_SCALEROWUP2_LINEAR_16_SSE2 +#define HAS_SCALEROWUP2_BILINEAR_16_SSE2 +#define 
HAS_SCALEUVROWUP2_LINEAR_SSSE3 +#define HAS_SCALEUVROWUP2_BILINEAR_SSSE3 +#define HAS_SCALEUVROWUP2_LINEAR_16_SSE41 +#define HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 +#endif + +// The following are available for gcc/clang x86 platforms, but +// require clang 3.4 or gcc 4.7. +// TODO(fbarchard): Port to Visual C +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || defined(__i386__)) && \ + (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) +#define HAS_SCALEUVROWDOWN2BOX_AVX2 +#define HAS_SCALEROWUP2_LINEAR_AVX2 +#define HAS_SCALEROWUP2_BILINEAR_AVX2 +#define HAS_SCALEROWUP2_LINEAR_12_AVX2 +#define HAS_SCALEROWUP2_BILINEAR_12_AVX2 +#define HAS_SCALEROWUP2_LINEAR_16_AVX2 +#define HAS_SCALEROWUP2_BILINEAR_16_AVX2 +#define HAS_SCALEUVROWUP2_LINEAR_AVX2 +#define HAS_SCALEUVROWUP2_BILINEAR_AVX2 +#define HAS_SCALEUVROWUP2_LINEAR_16_AVX2 +#define HAS_SCALEUVROWUP2_BILINEAR_16_AVX2 +#endif + +// The following are available on all x86 platforms, but +// require VS2012, clang 3.4 or gcc 4.7. +// The code supports NaCL but requires a new compiler and validator. +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \ + defined(GCC_HAS_AVX2)) +#define HAS_SCALEADDROW_AVX2 +#define HAS_SCALEROWDOWN2_AVX2 +#define HAS_SCALEROWDOWN4_AVX2 +#endif + +// Scale ARGB vertically with bilinear interpolation. +void ScalePlaneVertical(int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t *src_argb, + uint8_t *dst_argb, + int x, + int y, + int dy, + int bpp, + enum FilterMode filtering); + +// Simplify the filtering based on scale factors. +enum FilterMode ScaleFilterReduce(int src_width, + int src_height, + int dst_width, + int dst_height, + enum FilterMode filtering); + +// Divide num by div and return as 16.16 fixed point result. 
+int FixedDiv_X86(int num, int div); + +int FixedDiv1_X86(int num, int div); + +#ifdef HAS_FIXEDDIV_X86 +#define FixedDiv FixedDiv_X86 +#define FixedDiv1 FixedDiv1_X86 +#endif + +// Compute slope values for stepping. +void ScaleSlope(int src_width, + int src_height, + int dst_width, + int dst_height, + enum FilterMode filtering, + int *x, + int *y, + int *dx, + int *dy); + +void ScaleRowDown2_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst, + int dst_width); + +void ScaleRowDown2Linear_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst, + int dst_width); + +void ScaleRowDown2Box_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst, + int dst_width); + +void ScaleRowDown2Box_Odd_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst, + int dst_width); + +void ScaleRowDown4_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst, + int dst_width); + +void ScaleRowDown4Box_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst, + int dst_width); + +void ScaleRowDown34_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst, + int dst_width); + +void ScaleRowDown34_0_Box_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *d, + int dst_width); + +void ScaleRowDown34_1_Box_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *d, + int dst_width); + +void ScaleRowUp2_Linear_C(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleRowUp2_Linear_16_C(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_16_C(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleRowUp2_Linear_Any_C(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_Any_C(const uint8_t *src_ptr, + ptrdiff_t 
src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleRowUp2_Linear_16_Any_C(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_16_Any_C(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleCols_C(uint8_t *dst_ptr, + const uint8_t *src_ptr, + int dst_width, + int x, + int dx); + +void ScaleColsUp2_C(uint8_t *dst_ptr, + const uint8_t *src_ptr, + int dst_width, + int, + int); + +void ScaleFilterCols_C(uint8_t *dst_ptr, + const uint8_t *src_ptr, + int dst_width, + int x, + int dx); + +void ScaleFilterCols64_C(uint8_t *dst_ptr, + const uint8_t *src_ptr, + int dst_width, + int x32, + int dx); + +void ScaleRowDown38_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst, + int dst_width); + +void ScaleRowDown38_3_Box_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown38_2_Box_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleAddRow_C(const uint8_t *src_ptr, uint16_t *dst_ptr, int src_width); + +void ScaleUVRowDown2_C(const uint8_t *src_uv, + ptrdiff_t src_stride, + uint8_t *dst_uv, + int dst_width); + +void ScaleUVRowDown2Linear_C(const uint8_t *src_uv, + ptrdiff_t src_stride, + uint8_t *dst_uv, + int dst_width); + +void ScaleUVRowDown2Box_C(const uint8_t *src_uv, + ptrdiff_t src_stride, + uint8_t *dst_uv, + int dst_width); + +void ScaleUVRowDownEven_C(const uint8_t *src_uv, + ptrdiff_t src_stride, + int src_stepx, + uint8_t *dst_uv, + int dst_width); + +void ScaleUVRowUp2_Linear_C(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width); + +void ScaleUVRowUp2_Bilinear_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleUVRowUp2_Linear_Any_C(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width); + +void 
ScaleUVRowUp2_Bilinear_Any_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleUVRowUp2_Linear_16_C(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width); + +void ScaleUVRowUp2_Bilinear_16_C(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleUVRowUp2_Linear_16_Any_C(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width); + +void ScaleUVRowUp2_Bilinear_16_Any_C(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +// Specialized scalers for x86. +void ScaleRowDown2_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown2Linear_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown2Box_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown2_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown2Linear_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown2Box_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown4_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown4Box_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown4_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown4Box_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown34_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown34_1_Box_SSSE3(const uint8_t 
*src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown34_0_Box_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown38_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown38_3_Box_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown38_2_Box_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Linear_SSE2(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_SSE2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleRowUp2_Linear_12_SSSE3(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleRowUp2_Linear_16_SSE2(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleRowUp2_Linear_SSSE3(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleRowUp2_Linear_AVX2(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleRowUp2_Linear_12_AVX2(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int 
dst_width); + +void ScaleRowUp2_Linear_16_AVX2(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleRowUp2_Linear_Any_SSE2(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_Any_SSE2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleRowUp2_Linear_12_Any_SSSE3(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_12_Any_SSSE3(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleRowUp2_Linear_16_Any_SSE2(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_16_Any_SSE2(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleRowUp2_Linear_Any_SSSE3(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_Any_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleRowUp2_Linear_Any_AVX2(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_Any_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleRowUp2_Linear_12_Any_AVX2(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_12_Any_AVX2(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleRowUp2_Linear_16_Any_AVX2(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_16_Any_AVX2(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t 
dst_stride, + int dst_width); + +void ScaleRowDown2_Any_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown2Linear_Any_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown2Box_Any_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown2Box_Odd_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown2_Any_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown2Linear_Any_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown2Box_Any_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown2Box_Odd_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown4_Any_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown4Box_Any_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown4_Any_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown4Box_Any_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown34_Any_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown34_1_Box_Any_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown34_0_Box_Any_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown38_Any_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void 
ScaleRowDown38_3_Box_Any_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown38_2_Box_Any_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleAddRow_SSE2(const uint8_t *src_ptr, uint16_t *dst_ptr, int src_width); + +void ScaleAddRow_AVX2(const uint8_t *src_ptr, uint16_t *dst_ptr, int src_width); + +void ScaleAddRow_Any_SSE2(const uint8_t *src_ptr, + uint16_t *dst_ptr, + int src_width); + +void ScaleAddRow_Any_AVX2(const uint8_t *src_ptr, + uint16_t *dst_ptr, + int src_width); + +void ScaleFilterCols_SSSE3(uint8_t *dst_ptr, + const uint8_t *src_ptr, + int dst_width, + int x, + int dx); + +void ScaleColsUp2_SSE2(uint8_t *dst_ptr, + const uint8_t *src_ptr, + int dst_width, + int x, + int dx); + +// UV Row functions +void ScaleUVRowDown2Box_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_uv, + int dst_width); + +void ScaleUVRowDown2Box_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_uv, + int dst_width); + +void ScaleUVRowDown2Box_Any_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleUVRowDown2Box_Any_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleUVRowUp2_Linear_SSSE3(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width); + +void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleUVRowUp2_Linear_Any_SSSE3(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width); + +void ScaleUVRowUp2_Bilinear_Any_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleUVRowUp2_Linear_AVX2(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width); + +void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t 
*dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleUVRowUp2_Linear_Any_AVX2(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width); + +void ScaleUVRowUp2_Bilinear_Any_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width); + +void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleUVRowUp2_Linear_16_Any_SSE41(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width); + +void ScaleUVRowUp2_Bilinear_16_Any_SSE41(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width); + +void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleUVRowUp2_Linear_16_Any_AVX2(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width); + +void ScaleUVRowUp2_Bilinear_16_Any_AVX2(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +#endif // INCLUDE_LIBYUV_SCALE_ROW_H_ \ No newline at end of file diff --git a/pkg/encoder/yuv/libyuv/version.h b/pkg/encoder/yuv/libyuv/version.h new file mode 100644 index 000000000..d45ef09d6 --- /dev/null +++ b/pkg/encoder/yuv/libyuv/version.h @@ -0,0 +1,16 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_VERSION_H_ +#define INCLUDE_LIBYUV_VERSION_H_ + +#define LIBYUV_VERSION 1875 + +#endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/pkg/encoder/yuv/libyuv/video_common.c b/pkg/encoder/yuv/libyuv/video_common.c new file mode 100644 index 000000000..e492402e8 --- /dev/null +++ b/pkg/encoder/yuv/libyuv/video_common.c @@ -0,0 +1,50 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "video_common.h" + +struct FourCCAliasEntry { + uint32_t alias; + uint32_t canonical; +}; + +#define NUM_ALIASES 18 +static const struct FourCCAliasEntry kFourCCAliases[NUM_ALIASES] = { + {FOURCC_IYUV, FOURCC_I420}, + {FOURCC_YU12, FOURCC_I420}, + {FOURCC_YU16, FOURCC_I422}, + {FOURCC_YU24, FOURCC_I444}, + {FOURCC_YUYV, FOURCC_YUY2}, + {FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs + {FOURCC_HDYC, FOURCC_UYVY}, + {FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8 + {FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not. + {FOURCC_DMB1, FOURCC_MJPG}, + {FOURCC_BA81, FOURCC_BGGR}, // deprecated. 
+ {FOURCC_RGB3, FOURCC_RAW}, + {FOURCC_BGR3, FOURCC_24BG}, + {FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB + {FOURCC_CM24, FOURCC_RAW}, // kCMPixelFormat_24RGB + {FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555 + {FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565 + {FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551 +}; + +LIBYUV_API +uint32_t CanonicalFourCC(uint32_t fourcc) { + int i; + for (i = 0; i < NUM_ALIASES; ++i) { + if (kFourCCAliases[i].alias == fourcc) { + return kFourCCAliases[i].canonical; + } + } + // Not an alias, so return it as-is. + return fourcc; +} diff --git a/pkg/encoder/yuv/libyuv/video_common.h b/pkg/encoder/yuv/libyuv/video_common.h new file mode 100644 index 000000000..e2aacf44c --- /dev/null +++ b/pkg/encoder/yuv/libyuv/video_common.h @@ -0,0 +1,212 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// Common definitions for video, including fourcc and VideoFormat. + +#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_ +#define INCLUDE_LIBYUV_VIDEO_COMMON_H_ + +#include "basic_types.h" + +////////////////////////////////////////////////////////////////////////////// +// Definition of FourCC codes +////////////////////////////////////////////////////////////////////////////// + +// Convert four characters to a FourCC code. +// Needs to be a macro otherwise the OS X compiler complains when the kFormat* +// constants are used in a switch. 
+#ifdef __cplusplus +#define FOURCC(a, b, c, d) \ + ((static_cast<uint32_t>(a)) | (static_cast<uint32_t>(b) << 8) | \ + (static_cast<uint32_t>(c) << 16) | /* NOLINT */ \ + (static_cast<uint32_t>(d) << 24)) /* NOLINT */ +#else +#define FOURCC(a, b, c, d) \ + (((uint32_t)(a)) | ((uint32_t)(b) << 8) | /* NOLINT */ \ + ((uint32_t)(c) << 16) | ((uint32_t)(d) << 24)) /* NOLINT */ +#endif + +// Some pages discussing FourCC codes: +// http://www.fourcc.org/yuv.php +// http://v4l2spec.bytesex.org/spec/book1.htm +// http://developer.apple.com/quicktime/icefloe/dispatch020.html +// http://msdn.microsoft.com/library/windows/desktop/dd206750.aspx#nv12 +// http://people.xiph.org/~xiphmont/containers/nut/nut4cc.txt + +// FourCC codes grouped according to implementation efficiency. +// Primary formats should convert in 1 efficient step. +// Secondary formats are converted in 2 steps. +// Auxiliary formats call primary converters. +enum FourCC { + // 10 Primary YUV formats: 6 planar, 2 biplanar, 2 packed. + FOURCC_I420 = FOURCC('I', '4', '2', '0'), + FOURCC_I422 = FOURCC('I', '4', '2', '2'), + FOURCC_I444 = FOURCC('I', '4', '4', '4'), + FOURCC_I400 = FOURCC('I', '4', '0', '0'), + FOURCC_NV21 = FOURCC('N', 'V', '2', '1'), + FOURCC_NV12 = FOURCC('N', 'V', '1', '2'), + FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'), + FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'), + FOURCC_I010 = FOURCC('I', '0', '1', '0'), // bt.601 10 bit 420 + FOURCC_I210 = FOURCC('I', '2', '1', '0'), // bt.601 10 bit 422 + + // 1 Secondary YUV format: row biplanar. deprecated. + FOURCC_M420 = FOURCC('M', '4', '2', '0'), + + // 13 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 2 10 bpc, 2 64 bpp + FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'), + FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'), + FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'), + FOURCC_AR30 = FOURCC('A', 'R', '3', '0'), // 10 bit per channel. 2101010. + FOURCC_AB30 = FOURCC('A', 'B', '3', '0'), // ABGR version of 10 bit + FOURCC_AR64 = FOURCC('A', 'R', '6', '4'), // 16 bit per channel.
+ FOURCC_AB64 = FOURCC('A', 'B', '6', '4'), // ABGR version of 16 bit + FOURCC_24BG = FOURCC('2', '4', 'B', 'G'), + FOURCC_RAW = FOURCC('r', 'a', 'w', ' '), + FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'), + FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'), // rgb565 LE. + FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE. + FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444 LE. + + // 1 Primary Compressed YUV format. + FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'), + + // 14 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias. + FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'), + FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'), + FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'), + FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420. + FOURCC_J420 = + FOURCC('J', '4', '2', '0'), // jpeg (bt.601 full), unofficial fourcc + FOURCC_J422 = + FOURCC('J', '4', '2', '2'), // jpeg (bt.601 full), unofficial fourcc + FOURCC_J444 = + FOURCC('J', '4', '4', '4'), // jpeg (bt.601 full), unofficial fourcc + FOURCC_J400 = + FOURCC('J', '4', '0', '0'), // jpeg (bt.601 full), unofficial fourcc + FOURCC_F420 = FOURCC('F', '4', '2', '0'), // bt.709 full, unofficial fourcc + FOURCC_F422 = FOURCC('F', '4', '2', '2'), // bt.709 full, unofficial fourcc + FOURCC_F444 = FOURCC('F', '4', '4', '4'), // bt.709 full, unofficial fourcc + FOURCC_H420 = FOURCC('H', '4', '2', '0'), // bt.709, unofficial fourcc + FOURCC_H422 = FOURCC('H', '4', '2', '2'), // bt.709, unofficial fourcc + FOURCC_H444 = FOURCC('H', '4', '4', '4'), // bt.709, unofficial fourcc + FOURCC_U420 = FOURCC('U', '4', '2', '0'), // bt.2020, unofficial fourcc + FOURCC_U422 = FOURCC('U', '4', '2', '2'), // bt.2020, unofficial fourcc + FOURCC_U444 = FOURCC('U', '4', '4', '4'), // bt.2020, unofficial fourcc + FOURCC_F010 = FOURCC('F', '0', '1', '0'), // bt.709 full range 10 bit 420 + FOURCC_H010 = FOURCC('H', '0', '1', '0'), // bt.709 10 bit 420 + FOURCC_U010 = FOURCC('U', '0', '1', '0'), // bt.2020 10 bit 420 + FOURCC_F210 = 
FOURCC('F', '2', '1', '0'), // bt.709 full range 10 bit 422 + FOURCC_H210 = FOURCC('H', '2', '1', '0'), // bt.709 10 bit 422 + FOURCC_U210 = FOURCC('U', '2', '1', '0'), // bt.2020 10 bit 422 + FOURCC_P010 = FOURCC('P', '0', '1', '0'), + FOURCC_P210 = FOURCC('P', '2', '1', '0'), + + // 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc. + FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420. + FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'), // Alias for I422. + FOURCC_YU24 = FOURCC('Y', 'U', '2', '4'), // Alias for I444. + FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'), // Alias for YUY2. + FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'), // Alias for YUY2 on Mac. + FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'), // Alias for UYVY. + FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'), // Alias for UYVY on Mac. + FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'), // Alias for MJPG. + FOURCC_DMB1 = FOURCC('d', 'm', 'b', '1'), // Alias for MJPG on Mac. + FOURCC_BA81 = FOURCC('B', 'A', '8', '1'), // Alias for BGGR. + FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'), // Alias for RAW. + FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'), // Alias for 24BG. + FOURCC_CM32 = FOURCC(0, 0, 0, 32), // Alias for BGRA kCMPixelFormat_32ARGB + FOURCC_CM24 = FOURCC(0, 0, 0, 24), // Alias for RAW kCMPixelFormat_24RGB + FOURCC_L555 = FOURCC('L', '5', '5', '5'), // Alias for RGBO. + FOURCC_L565 = FOURCC('L', '5', '6', '5'), // Alias for RGBP. + FOURCC_5551 = FOURCC('5', '5', '5', '1'), // Alias for RGBO. + + // deprecated formats. Not supported, but defined for backward compatibility. + FOURCC_I411 = FOURCC('I', '4', '1', '1'), + FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), + FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'), + FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'), + FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'), + FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'), + FOURCC_H264 = FOURCC('H', '2', '6', '4'), + + // Match any fourcc. + FOURCC_ANY = -1, +}; + +enum FourCCBpp { + // Canonical fourcc codes used in our code. 
+ FOURCC_BPP_I420 = 12, + FOURCC_BPP_I422 = 16, + FOURCC_BPP_I444 = 24, + FOURCC_BPP_I411 = 12, + FOURCC_BPP_I400 = 8, + FOURCC_BPP_NV21 = 12, + FOURCC_BPP_NV12 = 12, + FOURCC_BPP_YUY2 = 16, + FOURCC_BPP_UYVY = 16, + FOURCC_BPP_M420 = 12, // deprecated + FOURCC_BPP_Q420 = 12, + FOURCC_BPP_ARGB = 32, + FOURCC_BPP_BGRA = 32, + FOURCC_BPP_ABGR = 32, + FOURCC_BPP_RGBA = 32, + FOURCC_BPP_AR30 = 32, + FOURCC_BPP_AB30 = 32, + FOURCC_BPP_AR64 = 64, + FOURCC_BPP_AB64 = 64, + FOURCC_BPP_24BG = 24, + FOURCC_BPP_RAW = 24, + FOURCC_BPP_RGBP = 16, + FOURCC_BPP_RGBO = 16, + FOURCC_BPP_R444 = 16, + FOURCC_BPP_RGGB = 8, + FOURCC_BPP_BGGR = 8, + FOURCC_BPP_GRBG = 8, + FOURCC_BPP_GBRG = 8, + FOURCC_BPP_YV12 = 12, + FOURCC_BPP_YV16 = 16, + FOURCC_BPP_YV24 = 24, + FOURCC_BPP_YU12 = 12, + FOURCC_BPP_J420 = 12, + FOURCC_BPP_J400 = 8, + FOURCC_BPP_H420 = 12, + FOURCC_BPP_H422 = 16, + FOURCC_BPP_I010 = 15, + FOURCC_BPP_I210 = 20, + FOURCC_BPP_H010 = 15, + FOURCC_BPP_H210 = 20, + FOURCC_BPP_P010 = 15, + FOURCC_BPP_P210 = 20, + FOURCC_BPP_MJPG = 0, // 0 means unknown. + FOURCC_BPP_H264 = 0, + FOURCC_BPP_IYUV = 12, + FOURCC_BPP_YU16 = 16, + FOURCC_BPP_YU24 = 24, + FOURCC_BPP_YUYV = 16, + FOURCC_BPP_YUVS = 16, + FOURCC_BPP_HDYC = 16, + FOURCC_BPP_2VUY = 16, + FOURCC_BPP_JPEG = 1, + FOURCC_BPP_DMB1 = 1, + FOURCC_BPP_BA81 = 8, + FOURCC_BPP_RGB3 = 24, + FOURCC_BPP_BGR3 = 24, + FOURCC_BPP_CM32 = 32, + FOURCC_BPP_CM24 = 24, + + // Match any fourcc. + FOURCC_BPP_ANY = 0, // 0 means unknown. +}; + +// Converts fourcc aliases into canonical ones. 
+LIBYUV_API uint32_t CanonicalFourCC(uint32_t fourcc); + +#endif // INCLUDE_LIBYUV_VIDEO_COMMON_H_ \ No newline at end of file diff --git a/pkg/encoder/yuv/yuv.c b/pkg/encoder/yuv/yuv.c deleted file mode 100644 index c4d918dc5..000000000 --- a/pkg/encoder/yuv/yuv.c +++ /dev/null @@ -1,130 +0,0 @@ -#include "yuv.h" - -#define Y601_STUDIO 1 - -// BT.601 STUDIO - -#ifdef Y601_STUDIO -// 66*R+129*G+25*B -static __inline int Y(uint8_t *__restrict rgb) { - int R = *rgb; - int G = *(rgb+1); - int B = *(rgb+2); - return (66*R+129*G+25*B+128)>>8; -} - -// 112*B-38*R-74G -static __inline int U(uint8_t *__restrict rgb) { - int R = *rgb; - int G = *(rgb+1); - int B = *(rgb+2); - return (-38*R-74*G+112*B+128) >> 8; -} - -// 112*R-94*G-18*B -static __inline int V(uint8_t *__restrict rgb) { - int R = 56**(rgb); - int G = 47**(rgb+1); - int B = *(rgb+2); - return (R-G-(B+(B<<3))+64) >> 7; -} - -static const int Y_MIN = 16; - -#else - -// BT.601 FULL - -// 77*R+150*G+29*B -static __inline int Y(uint8_t *rgb) { - int R = 77**(rgb); - int G = 150**(rgb+1); - int B = 29**(rgb+2); - return (R+G+B+128) >> 8; -} - -// 127*B-43*R-84*G -static __inline int U(uint8_t *rgb) { - int R = 43**(rgb); - int G = 84**(rgb+1); - int B = 127**(rgb+2); - return (-R-G+B+128) >> 8; -} - -// 127*R-106*G-21*B -static __inline int V(uint8_t *rgb) { - int R = 127**rgb; - int G = -106**(rgb+1); - int B = -21**(rgb+2); - return (G+B+R+128) >> 8; -} - -static const int Y_MIN = 0; -#endif - -static __inline void _y(uint8_t *__restrict p, uint8_t *__restrict y, int size) { - do { - *y++ = Y(p) + Y_MIN; - p += 4; - } while (--size); -} - -// It will take an average color from the 2x2 pixel group for chroma values. 
-// X X X X -// O O -// X X X X -static __inline void _4uv(uint8_t * __restrict p, uint8_t * __restrict u, uint8_t * __restrict v, const int w, const int h) { - uint8_t *p2, *p3, *p4; - const int row = w << 2; - const int next = 4; - - int x = w, y = h, sumU = 0, sumV = 0; - while (y > 0) { - while (x > 0) { - // xx.. - // .... - p2 = p+next; - sumU = U(p) + U(p2); - sumV = V(p) + V(p2); - // .... - // xx.. - p3 = p+row; - p4 = p3+next; - sumU += U(p3) + U(p4); - sumV += V(p3) + V(p4); - *u++ = 128 + (sumU >> 2); - *v++ = 128 + (sumV >> 2); - // ..x. - p += 8; - x -= 2; - } - p += row; - y -= 2; - x = w; - } -} - -// Converts RGBA image to YUV (I420) with BT.601 studio color range. -void rgbaToYuv(void *__restrict destination, void *__restrict source, const int w, const int h) { - const int image_size = w * h; - uint8_t *src = source; - uint8_t *dst_y = destination; - uint8_t *dst_u = destination + image_size; - uint8_t *dst_v = destination + image_size + image_size / 4; - _y(src, dst_y, image_size); - src = source; - _4uv(source, dst_u, dst_v, w, h); -} - -void luma(void *__restrict destination, void *__restrict source, const int pos, const int w, const int h) { - uint8_t *rgba = source + 4 * pos; - uint8_t *dst = destination + pos; - _y(rgba, dst, w*h); -} - -void chroma(void *__restrict dst, void *__restrict source, const int pos, const int deu, const int dev, const int w, const int h) { - uint8_t *src = source + 4 * pos; - uint8_t *dst_u = dst + deu + pos / 4; - uint8_t *dst_v = dst + dev + pos / 4; - _4uv(src, dst_u, dst_v, w, h); -} diff --git a/pkg/encoder/yuv/yuv.go b/pkg/encoder/yuv/yuv.go index 19a33318a..82f59ea78 100644 --- a/pkg/encoder/yuv/yuv.go +++ b/pkg/encoder/yuv/yuv.go @@ -3,123 +3,80 @@ package yuv import ( "image" "sync" - "unsafe" -) -/* -#cgo CFLAGS: -Wall -#include "yuv.h" -*/ -import "C" + "github.com/giongto35/cloud-game/v3/pkg/encoder/yuv/libyuv" +) -type ImgProcessor interface { - Process(rgba *image.RGBA) []byte - Put(*[]byte) +type 
Conv struct { + w, h int + sw, sh int + scale float64 + pool sync.Pool } -type Options struct { - Threads int +type RawFrame struct { + Data []byte + Stride int + W, H int } -type processor struct { - w, h int - - // cache - ww C.int - pool sync.Pool -} +type PixFmt uint32 -type threadedProcessor struct { - *processor +const FourccRgbp = libyuv.FourccRgbp +const FourccArgb = libyuv.FourccArgb +const FourccAbgr = libyuv.FourccAbgr - // threading - threads int - chunk int - - // cache - chromaU C.int - chromaV C.int - wg sync.WaitGroup +func NewYuvConv(w, h int, scale float64) Conv { + if scale < 1 { + scale = 1 + } + sw, sh := round(w, scale), round(h, scale) + bufSize := int(float64(sw) * float64(sh) * 1.5) + return Conv{ + w: w, h: h, sw: sw, sh: sh, scale: scale, + pool: sync.Pool{New: func() any { b := make([]byte, bufSize); return &b }}, + } } -// NewYuvImgProcessor creates new YUV image converter from RGBA. -func NewYuvImgProcessor(w, h int, opts *Options) ImgProcessor { - bufSize := int(float32(w*h) * 1.5) +// Process converts an image to YUV I420 format inside the internal buffer. 
+func (c *Conv) Process(frame RawFrame, rot uint, pf PixFmt) []byte { + dx, dy := c.w, c.h // dest + cx, cy := c.w, c.h // crop + if rot == 90 || rot == 270 { + cx, cy = cy, cx + } - processor := processor{ - w: w, - h: h, - ww: C.int(w), - pool: sync.Pool{New: func() any { - b := make([]byte, bufSize) - return &b - }}, + stride := frame.Stride >> 2 + if pf == PixFmt(libyuv.FourccRgbp) { + stride = frame.Stride >> 1 } - if opts != nil && opts.Threads > 0 { - // chunks the image evenly - chunk := h / opts.Threads - if chunk%2 != 0 { - chunk-- - } + buf := *c.pool.Get().(*[]byte) + libyuv.Y420(frame.Data, buf, frame.W, frame.H, stride, dx, dy, rot, uint32(pf), cx, cy) - return &threadedProcessor{ - chromaU: C.int(w * h), - chromaV: C.int(w*h + w*h/4), - chunk: chunk, - processor: &processor, - threads: opts.Threads, - wg: sync.WaitGroup{}, - } + if c.scale > 1 { + dstBuf := *c.pool.Get().(*[]byte) + libyuv.Y420Scale(buf, dstBuf, dx, dy, c.sw, c.sh) + c.pool.Put(&buf) + return dstBuf } - return &processor -} - -// Process converts RGBA colorspace into YUV I420 format inside the internal buffer. -// Non-threaded version. -func (yuv *processor) Process(rgba *image.RGBA) []byte { - buf := *yuv.pool.Get().(*[]byte) - C.rgbaToYuv(unsafe.Pointer(&buf[0]), unsafe.Pointer(&rgba.Pix[0]), yuv.ww, C.int(yuv.h)) return buf } -func (yuv *processor) Put(x *[]byte) { yuv.pool.Put(x) } +func (c *Conv) Put(x *[]byte) { c.pool.Put(x) } +func (c *Conv) Version() string { return libyuv.Version() } +func round(x int, scale float64) int { return (int(float64(x)*scale) + 1) & ^1 } -// Process converts RGBA colorspace into YUV I420 format inside the internal buffer. -// Threaded version. -// -// We divide the input image into chunks by the number of available CPUs. -// Each chunk should contain 2, 4, 6, etc. rows of the image. 
-// -// 8x4 CPU (2) -// x x x x x x x x | Coroutine 1 -// x x x x x x x x | Coroutine 1 -// x x x x x x x x | Coroutine 2 -// x x x x x x x x | Coroutine 2 -func (yuv *threadedProcessor) Process(rgba *image.RGBA) []byte { - src := unsafe.Pointer(&rgba.Pix[0]) - buf := *yuv.pool.Get().(*[]byte) - dst := unsafe.Pointer(&buf[0]) - yuv.wg.Add(yuv.threads << 1) - chunk := yuv.w * yuv.chunk - for i := 0; i < yuv.threads; i++ { - pos, hh := C.int(i*chunk), C.int(yuv.chunk) - if i == yuv.threads-1 { - hh = C.int(yuv.h - i*yuv.chunk) - } - go yuv.chroma_(src, dst, pos, hh) - go yuv.luma_(src, dst, pos, hh) - } - yuv.wg.Wait() - return buf -} +func ToYCbCr(bytes []byte, w, h int) *image.YCbCr { + cw, ch := (w+1)/2, (h+1)/2 -func (yuv *threadedProcessor) luma_(src unsafe.Pointer, dst unsafe.Pointer, pos C.int, hh C.int) { - C.luma(dst, src, pos, yuv.ww, hh) - yuv.wg.Done() -} + i0 := w*h + 0*cw*ch + i1 := w*h + 1*cw*ch + i2 := w*h + 2*cw*ch -func (yuv *threadedProcessor) chroma_(src unsafe.Pointer, dst unsafe.Pointer, pos C.int, hh C.int) { - C.chroma(dst, src, pos, yuv.chromaU, yuv.chromaV, yuv.ww, hh) - yuv.wg.Done() + yuv := image.NewYCbCr(image.Rect(0, 0, w, h), image.YCbCrSubsampleRatio420) + yuv.Y = bytes[:i0:i0] + yuv.Cb = bytes[i0:i1:i1] + yuv.Cr = bytes[i1:i2:i2] + return yuv } diff --git a/pkg/encoder/yuv/yuv.h b/pkg/encoder/yuv/yuv.h deleted file mode 100644 index 6b39ec521..000000000 --- a/pkg/encoder/yuv/yuv.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef YUV_H__ -#define YUV_H__ - -#include <stdint.h> - -// Converts RGBA image to YUV (I420) with BT.601 studio color range. -void rgbaToYuv(void *destination, void *source, int width, int height); - -// Converts RGBA image chunk to YUV (I420) chroma with BT.601 studio color range. -// pos contains a shift value for chunks. -// deu, dev contains constant shifts for U, V planes in the resulting array. -// chroma (0, 1) selects chroma estimation algorithm.
-void chroma(void *destination, void *source, int pos, int deu, int dev, int width, int height); - -// Converts RGBA image chunk to YUV (I420) luma with BT.601 studio color range. -void luma(void *destination, void *source, int pos, int width, int height); - -#endif diff --git a/pkg/encoder/yuv/yuv_test.go b/pkg/encoder/yuv/yuv_test.go index fbf53efe8..6b67c29f0 100644 --- a/pkg/encoder/yuv/yuv_test.go +++ b/pkg/encoder/yuv/yuv_test.go @@ -1,213 +1,188 @@ package yuv import ( + "archive/zip" "fmt" "image" "image/color" "image/png" + "io" "math" "math/rand" "os" - "reflect" - "runtime" + "path/filepath" "testing" - "time" -) - -func TestYuv(t *testing.T) { - size1, size2 := 32, 32 - for i := 1; i < 100; i++ { - img := generateImage(size1, size2, randomColor()) - pc := NewYuvImgProcessor(size1, size2, new(Options)) - pct := NewYuvImgProcessor(size1, size2, &Options{Threads: runtime.NumCPU()}) - - a := pc.Process(img) - b := pct.Process(img) - if !reflect.DeepEqual(a, b) { - t.Fatalf("couldn't convert %v, \n %v \n %v", img.Pix, a, b) - } - } -} + "github.com/giongto35/cloud-game/v3/pkg/encoder/yuv/libyuv" + _ "github.com/giongto35/cloud-game/v3/test" +) func TestYuvPredefined(t *testing.T) { im := []uint8{101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 
226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 
226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 
226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 
226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 
226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 
226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 
226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 
226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 
226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255} should := []byte{} - pc := NewYuvImgProcessor(32, 32, new(Options)) - pct := NewYuvImgProcessor(32, 32, &Options{Threads: runtime.NumCPU()}) - - img := image.NewRGBA(image.Rect(0, 0, 32, 32)) - img.Pix = im + pc := NewYuvConv(32, 32, 1) + frame := RawFrame{Data: im, Stride: 32, W: 32, H: 32} + a := pc.Process(frame, 0, 
PixFmt(libyuv.FourccAbgr)) - a := pc.Process(img) - b := pct.Process(img) - - if len(a) != len(b) || len(a) != len(should) || len(b) != len(should) { - t.Fatalf("diffrent size a: %v, b: %v, o: %v", len(a), len(b), len(should)) + if len(a) != len(should) { + t.Fatalf("diffrent size a: %v, o: %v", len(a), len(should)) } for i := 0; i < len(a); i++ { - if a[i] != b[i] || a[i] != should[i] || b[i] != should[i] { - t.Fatalf("diff in %vth, %v != %v != %v \n%v\n%v", i, a[i], b[i], should[i], im, should) + if a[i] != should[i] { + t.Fatalf("diff in %vth, %v != %v \n%v\n%v", i, a[i], should[i], im, should) } } } -func generateImage(w, h int, color color.RGBA) *image.RGBA { - img := image.NewRGBA(image.Rect(0, 0, w, h)) - for x := 0; x < w; x++ { - for y := 0; y < h; y++ { - img.Set(x, y, color) - } +func TestYuvScale(t *testing.T) { + name := "001_alsa_ABGR_256_240_1024.raw" + path := filepath.Join("./test/testdata/raw/", name) + + data, err := ReadZip(path + ".zip") + if err != nil { + t.Error(err) } - return img -} -func randomColor() color.RGBA { - rnd := rand.New(rand.NewSource(time.Now().Unix())) - return color.RGBA{ - R: uint8(rnd.Intn(255)), - G: uint8(rnd.Intn(255)), - B: uint8(rnd.Intn(255)), - A: 255, + pf, w, h, stride := PixFmt(libyuv.FourccArgb), 256, 240, 1024 + scale := 2 + + conv := NewYuvConv(w, h, float64(scale)) + frame := RawFrame{Data: data, Stride: stride, W: w, H: h} + out := conv.Process(frame, 0, pf) + + d := float64(len(out)) / float64(len(data)) + if d != 1.5 { + t.Errorf("Scaled not by factor %v, %v", scale, d) } + + // save as RGBA + //sw, sh := w*scale, h*scale + //yuv := ToYCbCr(out, sw, sh) + //if f, err := os.Create(filepath.Join("./", name+".png")); err == nil { + // if err = png.Encode(f, yuv); err != nil { + // t.Logf("Couldn't encode the image, %v", err) + // } + // _ = f.Close() + //} } -func BenchmarkYUV(b *testing.B) { - cpu := runtime.NumCPU() +func BenchmarkYuv(b *testing.B) { tests := []struct { - cpu int - w int - h int + w int + 
h int }{ - {cpu: cpu * 0, w: 1920, h: 1080}, - {cpu: cpu * 2, w: 1920, h: 1080}, - {cpu: cpu * 4, w: 1920, h: 1080}, - {cpu: cpu * 0, w: 320, h: 240}, - {cpu: cpu * 2, w: 320, h: 240}, - {cpu: cpu * 4, w: 320, h: 240}, + {w: 1920, h: 1080}, + {w: 320, h: 240}, } - for _, bn := range tests { - b.Run(fmt.Sprintf("%d-%vx%v", bn.cpu, bn.w, bn.h), func(b *testing.B) { - _processYUV(bn.w, bn.h, bn.cpu, b) - }) - } -} - -func BenchmarkYUVReference(b *testing.B) { _processYUV(1920, 1080, 0, b) } - -func _processYUV(w, h, cpu int, b *testing.B) { - b.StopTimer() - r1 := rand.New(rand.NewSource(int64(1))).Float32() - r2 := rand.New(rand.NewSource(int64(2))).Float32() - pc := NewYuvImgProcessor(w, h, &Options{Threads: cpu}) - - image1 := genTestImage(w, h, r1) - image2 := genTestImage(w, h, r2) - - for i := 0; i < b.N; i++ { - im := image1 - if i%2 == 0 { - im = image2 - } - b.StartTimer() - pc.Process(im) - b.StopTimer() - b.SetBytes(int64(len(im.Pix))) + for _, test := range tests { + w, h := test.w, test.h + frame := genFrame(w, h, r1) + b.Run(fmt.Sprintf("%vx%v YUV", w, h), func(b *testing.B) { + pc := NewYuvConv(w, h, 1) + for i := 0; i < b.N; i++ { + pc.Process(frame, 0, PixFmt(libyuv.FourccAbgr)) + b.SetBytes(int64(len(frame.Data))) + } + b.ReportAllocs() + }) } - b.ReportAllocs() } -func genTestImage(w, h int, seed float32) *image.RGBA { +func genFrame(w, h int, seed float32) RawFrame { img := image.NewRGBA(image.Rectangle{Max: image.Point{X: w, Y: h}}) for x := 0; x < w; x++ { for y := 0; y < h; y++ { @@ -215,7 +190,12 @@ func genTestImage(w, h int, seed float32) *image.RGBA { img.Set(x, y, col) } } - return img + return RawFrame{ + Data: img.Pix, + Stride: img.Stride, + W: img.Bounds().Dx(), + H: img.Bounds().Dy(), + } } func TestGen24bitFull(t *testing.T) { @@ -282,3 +262,19 @@ func hsb2rgb(hue, s, bri float64) (r, g, b int) { } return } + +func ReadZip(path string) ([]byte, error) { + zf, err := zip.OpenReader(path) + if err != nil { + return nil, err + } + defer 
func() { _ = zf.Close() }() + + f, err := zf.File[0].Open() + if err != nil { + return nil, err + } + defer func() { _ = f.Close() }() + + return io.ReadAll(f) +} diff --git a/pkg/worker/caged/app/app.go b/pkg/worker/caged/app/app.go index 82fff885e..a1917b4d5 100644 --- a/pkg/worker/caged/app/app.go +++ b/pkg/worker/caged/app/app.go @@ -1,7 +1,5 @@ package app -import "image" - type App interface { AudioSampleRate() int Init() error @@ -20,6 +18,12 @@ type Audio struct { } type Video struct { - Frame image.RGBA + Frame RawFrame Duration int32 } + +type RawFrame struct { + Data []byte + Stride int + W, H int +} diff --git a/pkg/worker/caged/libretro/caged.go b/pkg/worker/caged/libretro/caged.go index 57ea82d8d..de0ba038d 100644 --- a/pkg/worker/caged/libretro/caged.go +++ b/pkg/worker/caged/libretro/caged.go @@ -50,8 +50,7 @@ func (c *Caged) Load(game games.GameMetadata, path string) error { return err } w, h := c.ViewportCalc() - c.SetViewport(w, h, c.conf.Emulator.Scale) - + c.SetViewport(w, h) return nil } @@ -75,8 +74,11 @@ func (c *Caged) EnableCloudStorage(uid string, storage cloud.Storage) { } } +func (c *Caged) PixFormat() uint32 { return c.Emulator.PixFormat() } +func (c *Caged) Rotation() uint { return c.Emulator.Rotation() } func (c *Caged) AudioSampleRate() int { return c.Emulator.AudioSampleRate() } func (c *Caged) ViewportSize() (int, int) { return c.Emulator.ViewportSize() } +func (c *Caged) Scale() float64 { return c.Emulator.Scale() } func (c *Caged) SendControl(port int, data []byte) { c.base.Input(port, data) } func (c *Caged) Start() { go c.Emulator.Start() } func (c *Caged) SetSaveOnClose(v bool) { c.base.SaveOnClose = v } diff --git a/pkg/worker/caged/libretro/frontend.go b/pkg/worker/caged/libretro/frontend.go index bf7651b4c..63d323201 100644 --- a/pkg/worker/caged/libretro/frontend.go +++ b/pkg/worker/caged/libretro/frontend.go @@ -14,7 +14,6 @@ import ( "github.com/giongto35/cloud-game/v3/pkg/logger" 
"github.com/giongto35/cloud-game/v3/pkg/os" "github.com/giongto35/cloud-game/v3/pkg/worker/caged/app" - "github.com/giongto35/cloud-game/v3/pkg/worker/caged/libretro/image" "github.com/giongto35/cloud-game/v3/pkg/worker/caged/libretro/nanoarch" ) @@ -25,12 +24,14 @@ type Emulator interface { LoadGame(path string) error FPS() int Flipped() bool + Rotation() uint + PixFormat() uint32 AudioSampleRate() int IsPortrait() bool // Start is called after LoadGame Start() // SetViewport sets viewport size - SetViewport(width int, height int, scale int) + SetViewport(width int, height int) // ViewportCalc calculates the viewport size with the aspect ratio and scale ViewportCalc() (nw int, nh int) ViewportSize() (w, h int) @@ -48,10 +49,11 @@ type Emulator interface { ToggleMultitap() // Input passes input to the emulator Input(player int, data []byte) + // Scale returns set video scale factor + Scale() float64 } type Frontend struct { - canvas *image.Canvas conf config.Emulator done chan struct{} input InputState @@ -60,6 +62,7 @@ type Frontend struct { onAudio func(app.Audio) onVideo func(app.Video) storage Storage + scale float64 th int // draw threads vw, vh int // out frame size @@ -151,6 +154,12 @@ func (f *Frontend) LoadCore(emu string) { UsesLibCo: conf.UsesLibCo, } f.mu.Lock() + scale := 1.0 + if conf.Scale > 1 { + scale = conf.Scale + f.log.Debug().Msgf("Scale: x%v", scale) + } + f.scale = scale f.nano.CoreLoad(meta) f.mu.Unlock() } @@ -169,30 +178,27 @@ func (f *Frontend) handleAudio(audio unsafe.Pointer, samples int) { } func (f *Frontend) handleVideo(data []byte, delta int32, fi nanoarch.FrameInfo) { - pixFmt := f.nano.Video.PixFmt - bpp := int(f.nano.Video.BPP) - drawn := f.canvas.Draw(pixFmt, f.nano.Rot, int(fi.W), int(fi.H), int(fi.Packed), bpp, data, f.th) - + // !to merge both pools fr, _ := videoPool.Get().(*app.Video) if fr == nil { fr = new(app.Video) } - fr.Frame = drawn.Unwrap() + fr.Frame.Data = data + fr.Frame.W = int(fi.W) + fr.Frame.H = int(fi.H) + 
fr.Frame.Stride = int(fi.Stride) fr.Duration = delta f.onVideo(*fr) - f.canvas.Put(drawn) videoPool.Put(fr) } func (f *Frontend) Shutdown() { - f.log.Debug().Msgf("run loop cleanup") f.mu.Lock() f.nano.Shutdown() - f.canvas.Clear() f.SetAudioCb(noAudio) f.SetVideoCb(noVideo) f.mu.Unlock() - f.log.Debug().Msgf("run loop finished") + f.log.Debug().Msgf("frontend closed") } func (f *Frontend) linkNano(nano *nanoarch.Nanoarch) { @@ -240,6 +246,8 @@ func (f *Frontend) Start() { } } +func (f *Frontend) PixFormat() uint32 { return f.nano.Video.PixFmt.C } +func (f *Frontend) Rotation() uint { return f.nano.Rot } func (f *Frontend) Flipped() bool { return f.nano.IsGL() } func (f *Frontend) FrameSize() (int, int) { return f.nano.GeometryBase() } func (f *Frontend) FPS() int { return f.nano.VideoFramerate() } @@ -250,21 +258,15 @@ func (f *Frontend) AudioSampleRate() int { return f.nano.AudioSampleRat func (f *Frontend) Input(player int, data []byte) { f.input.setInput(player, data) } func (f *Frontend) LoadGame(path string) error { return f.nano.LoadGame(path) } func (f *Frontend) RestoreGameState() error { return f.Load() } +func (f *Frontend) Scale() float64 { return f.scale } func (f *Frontend) IsPortrait() bool { return f.nano.IsPortrait() } func (f *Frontend) SaveGameState() error { return f.Save() } -func (f *Frontend) Scale(factor int) { w, h := f.ViewportSize(); f.SetViewport(w, h, factor) } func (f *Frontend) SetAudioCb(cb func(app.Audio)) { f.onAudio = cb } func (f *Frontend) SetSessionId(name string) { f.storage.SetMainSaveName(name) } func (f *Frontend) SetVideoCb(ff func(app.Video)) { f.onVideo = ff } -func (f *Frontend) SetViewport(width int, height int, scale int) { +func (f *Frontend) SetViewport(width int, height int) { f.mu.Lock() f.vw, f.vh = width, height - mw, mh := f.nano.GeometryMax() - size := mw * scale * mh * scale - f.canvas = image.NewCanvas(width, height, size) - if f.DisableCanvasPool { - f.canvas.SetEnabled(false) - } f.mu.Unlock() } @@ -292,14 
+294,9 @@ func (f *Frontend) ViewportCalc() (nw int, nh int) { nw, nh = w, h } - if f.conf.Scale > 1 { - nw, nh = nw*f.conf.Scale, nh*f.conf.Scale - f.log.Debug().Msgf("Viewport size scaled: %dx%d", nw, nh) - } - if f.IsPortrait() { nw, nh = nh, nw - f.log.Debug().Msgf("Viewport was flipped") + f.log.Debug().Msgf("Set portrait mode") } f.log.Info().Msgf("Viewport final size: %dx%d", nw, nh) diff --git a/pkg/worker/caged/libretro/frontend_test.go b/pkg/worker/caged/libretro/frontend_test.go index afedbd539..60a08dee8 100644 --- a/pkg/worker/caged/libretro/frontend_test.go +++ b/pkg/worker/caged/libretro/frontend_test.go @@ -7,18 +7,27 @@ import ( "log" "math/rand" "os" - "path" "path/filepath" "sync" "testing" - "unsafe" "github.com/giongto35/cloud-game/v3/pkg/config" "github.com/giongto35/cloud-game/v3/pkg/logger" "github.com/giongto35/cloud-game/v3/pkg/worker/caged/app" + "github.com/giongto35/cloud-game/v3/pkg/worker/caged/libretro/manager" "github.com/giongto35/cloud-game/v3/pkg/worker/caged/libretro/nanoarch" + "github.com/giongto35/cloud-game/v3/pkg/worker/thread" + + _ "github.com/giongto35/cloud-game/v3/test" ) +type TestFrontend struct { + *Frontend + + corePath string + gamePath string +} + type testRun struct { room string system string @@ -26,46 +35,48 @@ type testRun struct { emulationTicks int } -// EmulatorMock contains Frontend mocking data. -type EmulatorMock struct { - *Frontend - - // Libretro compiled lib core name - core string - // shared core paths (can't be changed) - paths EmulatorPaths +type game struct { + rom string + system string } -// EmulatorPaths defines various emulator file paths. -type EmulatorPaths struct { - assets string - cores string - games string - save string +var ( + alwa = game{system: "nes", rom: "Alwa's Awakening (Demo).nes"} + sushi = game{system: "gba", rom: "Sushi The Cat.gba"} + angua = game{system: "gba", rom: "anguna.gba"} +) + +// TestMain runs all tests in the main thread in macOS. 
+func TestMain(m *testing.M) { + thread.Wrap(func() { os.Exit(m.Run()) }) } -// GetEmulatorMock returns a properly stubbed emulator instance. +// EmulatorMock returns a properly stubbed emulator instance. // Due to extensive use of globals -- one mock instance is allowed per a test run. // Don't forget to init one image channel consumer, it will lock-out otherwise. // Make sure you call Shutdown(). -func GetEmulatorMock(room string, system string) *EmulatorMock { - rootPath := getRootPath() - +func EmulatorMock(room string, system string) *TestFrontend { var conf config.WorkerConfig if _, err := config.LoadConfig(&conf, ""); err != nil { panic(err) } - meta := conf.Emulator.GetLibretroCoreConfig(system) - - nano := nanoarch.NewNano(cleanPath(conf.Emulator.LocalPath)) + conf.Emulator.Libretro.Cores.Repo.ExtLock = expand("tests", ".cr", "cloud-game.lock") + conf.Emulator.LocalPath = expand("tests", conf.Emulator.LocalPath) + conf.Emulator.Storage = expand("tests", "storage") l := logger.Default() l2 := l.Extend(l.Level(logger.ErrorLevel).With()) + + if err := manager.CheckCores(conf.Emulator, l); err != nil { + panic(err) + } + + nano := nanoarch.NewNano(conf.Emulator.LocalPath) nano.SetLogger(l2) // an emu - emu := &EmulatorMock{ + emu := &TestFrontend{ Frontend: &Frontend{ conf: conf.Emulator, storage: &StateStorage{ @@ -78,27 +89,19 @@ func GetEmulatorMock(room string, system string) *EmulatorMock { log: l2, SaveOnClose: false, }, - - core: path.Base(meta.Lib), - - paths: EmulatorPaths{ - assets: cleanPath(rootPath), - cores: cleanPath(rootPath + "assets/cores/"), - games: cleanPath(rootPath + "assets/games/"), - }, + corePath: expand(conf.Emulator.GetLibretroCoreConfig(system).Lib), + gamePath: expand(conf.Worker.Library.BasePath), } emu.linkNano(nano) - emu.paths.save = cleanPath(emu.HashPath()) - return emu } -// GetDefaultFrontend returns initialized emulator mock with default params. +// DefaultFrontend returns initialized emulator mock with default params. 
// Spawns audio/image channels consumers. // Don't forget to close emulator mock with Shutdown(). -func GetDefaultFrontend(room string, system string, rom string) *EmulatorMock { - mock := GetEmulatorMock(room, system) +func DefaultFrontend(room string, system string, rom string) *TestFrontend { + mock := EmulatorMock(room, system) mock.loadRom(rom) mock.SetVideoCb(func(app.Video) {}) mock.SetAudioCb(func(app.Audio) {}) @@ -107,25 +110,30 @@ func GetDefaultFrontend(room string, system string, rom string) *EmulatorMock { // loadRom loads a ROM into the emulator. // The rom will be loaded from emulators' games path. -func (emu *EmulatorMock) loadRom(game string) { - fmt.Printf("%v %v\n", emu.paths.cores, emu.core) - emu.nano.CoreLoad(nanoarch.Metadata{LibPath: emu.paths.cores + emu.core}) - err := emu.nano.LoadGame(emu.paths.games + game) +func (emu *TestFrontend) loadRom(game string) { + emu.nano.CoreLoad(nanoarch.Metadata{LibPath: emu.corePath}) + + gamePath := expand(emu.gamePath, game) + + conf := emu.conf.GetLibretroCoreConfig(gamePath) + scale := 1.0 + if conf.Scale > 1 { + scale = conf.Scale + } + emu.scale = scale + + err := emu.nano.LoadGame(gamePath) if err != nil { log.Fatal(err) } w, h := emu.FrameSize() - if emu.conf.Scale == 0 { - emu.conf.Scale = 1 - } - emu.SetViewport(w, h, emu.conf.Scale) + emu.SetViewport(w, h) } // Shutdown closes the emulator and cleans its resources. -func (emu *EmulatorMock) Shutdown() { +func (emu *TestFrontend) Shutdown() { _ = os.Remove(emu.HashPath()) _ = os.Remove(emu.SRAMPath()) - emu.Frontend.Close() emu.Frontend.Shutdown() } @@ -133,97 +141,56 @@ func (emu *EmulatorMock) Shutdown() { // dumpState returns the current emulator state and // the latest saved state for its session. // Locks the emulator. 
-func (emu *EmulatorMock) dumpState() (string, string) { +func (emu *TestFrontend) dumpState() (string, string) { emu.mu.Lock() - bytes, _ := os.ReadFile(emu.paths.save) - persistedStateHash := getHash(bytes) + bytes, _ := os.ReadFile(emu.HashPath()) + lastStateHash := hash(bytes) emu.mu.Unlock() - stateHash := emu.getStateHash() - fmt.Printf("mem: %v, dat: %v\n", stateHash, persistedStateHash) - return stateHash, persistedStateHash -} - -// getStateHash returns the current emulator state hash. -// Locks the emulator. -func (emu *EmulatorMock) getStateHash() string { emu.mu.Lock() state, _ := nanoarch.SaveState() emu.mu.Unlock() + stateHash := hash(state) - return getHash(state) + fmt.Printf("mem: %v, dat: %v\n", stateHash, lastStateHash) + return stateHash, lastStateHash } -// getRootPath returns absolute path to the root directory. -func getRootPath() string { - p, _ := filepath.Abs("../../../../") - return p + string(filepath.Separator) -} - -// getHash returns MD5 hash. -func getHash(bytes []byte) string { return fmt.Sprintf("%x", md5.Sum(bytes)) } - -// cleanPath returns a proper file path for current OS. -func cleanPath(path string) string { return filepath.FromSlash(path) } - -// benchmarkEmulator is a generic function for -// measuring emulator performance for one emulation frame. 
-func benchmarkEmulator(system string, rom string, b *testing.B) { - b.StopTimer() +func BenchmarkEmulators(b *testing.B) { log.SetOutput(io.Discard) os.Stdout, _ = os.Open(os.DevNull) - s := GetDefaultFrontend("bench_"+system+"_performance", system, rom) - - b.StartTimer() - for i := 0; i < b.N; i++ { - s.nano.Run() + benchmarks := []struct { + name string + system string + rom string + }{ + {name: "GBA Sushi", system: sushi.system, rom: sushi.rom}, + {name: "NES Alwa", system: alwa.system, rom: alwa.rom}, } - s.Shutdown() -} - -func BenchmarkEmulatorGba(b *testing.B) { - benchmarkEmulator("gba", "Sushi The Cat.gba", b) -} -func BenchmarkEmulatorNes(b *testing.B) { - benchmarkEmulator("nes", "Alwa's Awakening (Demo).nes", b) -} - -func TestSwap(t *testing.T) { - data := []byte{1, 254, 255, 32} - pixel := *(*uint32)(unsafe.Pointer(&data[0])) - // 0 1 2 3 - // 2 1 0 3 - ll := ((pixel >> 16) & 0xff) | (pixel & 0xff00) | ((pixel << 16) & 0xff0000) | 0xff000000 - - rez := []byte{0, 0, 0, 0} - *(*uint32)(unsafe.Pointer(&rez[0])) = ll - - log.Printf("%v\n%v", data, rez) + for _, bench := range benchmarks { + b.Run(bench.name, func(b *testing.B) { + s := DefaultFrontend("bench_"+bench.system+"_performance", bench.system, bench.rom) + for i := 0; i < b.N; i++ { + s.nano.Run() + } + s.Shutdown() + }) + } } // Tests a successful emulator state save. 
func TestSave(t *testing.T) { tests := []testRun{ - { - room: "test_save_ok_00", - system: "gba", - rom: "Sushi The Cat.gba", - emulationTicks: 100, - }, - { - room: "test_save_ok_01", - system: "gba", - rom: "anguna.gba", - emulationTicks: 10, - }, + {room: "test_save_ok_00", system: sushi.system, rom: sushi.rom, emulationTicks: 100}, + {room: "test_save_ok_01", system: angua.system, rom: angua.rom, emulationTicks: 10}, } for _, test := range tests { t.Logf("Testing [%v] save with [%v]\n", test.system, test.rom) - front := GetDefaultFrontend(test.room, test.system, test.rom) + front := DefaultFrontend(test.room, test.system, test.rom) for test.emulationTicks > 0 { front.Tick() @@ -255,30 +222,15 @@ func TestSave(t *testing.T) { // Compare states (a) and (b), should be =. func TestLoad(t *testing.T) { tests := []testRun{ - { - room: "test_load_00", - system: "nes", - rom: "Alwa's Awakening (Demo).nes", - emulationTicks: 100, - }, - { - room: "test_load_01", - system: "gba", - rom: "Sushi The Cat.gba", - emulationTicks: 1000, - }, - { - room: "test_load_02", - system: "gba", - rom: "anguna.gba", - emulationTicks: 100, - }, + {room: "test_load_00", system: alwa.system, rom: alwa.rom, emulationTicks: 100}, + {room: "test_load_01", system: sushi.system, rom: sushi.rom, emulationTicks: 1000}, + {room: "test_load_02", system: angua.system, rom: angua.rom, emulationTicks: 100}, } for _, test := range tests { t.Logf("Testing [%v] load with [%v]\n", test.system, test.rom) - mock := GetDefaultFrontend(test.room, test.system, test.rom) + mock := DefaultFrontend(test.room, test.system, test.rom) fmt.Printf("[%-14v] ", "initial") mock.dumpState() @@ -317,26 +269,15 @@ func TestLoad(t *testing.T) { func TestStateConcurrency(t *testing.T) { tests := []struct { - run testRun - // determine random + run testRun seed int }{ { - run: testRun{ - room: "test_concurrency_00", - system: "gba", - rom: "Sushi The Cat.gba", - emulationTicks: 120, - }, + run: testRun{room: 
"test_concurrency_00", system: sushi.system, rom: sushi.rom, emulationTicks: 120}, seed: 42, }, { - run: testRun{ - room: "test_concurrency_01", - system: "gba", - rom: "anguna.gba", - emulationTicks: 300, - }, + run: testRun{room: "test_concurrency_01", system: angua.system, rom: angua.rom, emulationTicks: 300}, seed: 42 + 42, }, } @@ -344,7 +285,7 @@ func TestStateConcurrency(t *testing.T) { for _, test := range tests { t.Logf("Testing [%v] concurrency with [%v]\n", test.run.system, test.run.rom) - mock := GetEmulatorMock(test.run.room, test.run.system) + mock := EmulatorMock(test.run.room, test.run.system) ops := &sync.WaitGroup{} // quantum lock @@ -352,14 +293,14 @@ func TestStateConcurrency(t *testing.T) { mock.loadRom(test.run.rom) mock.SetVideoCb(func(v app.Video) { - if len(v.Frame.Pix) == 0 { + if len(v.Frame.Data) == 0 { t.Errorf("It seems that rom video frame was empty, which is strange!") } }) mock.SetAudioCb(func(app.Audio) {}) t.Logf("Random seed is [%v]\n", test.seed) - t.Logf("Save path is [%v]\n", mock.paths.save) + t.Logf("Save path is [%v]\n", mock.HashPath()) _ = mock.Save() @@ -404,36 +345,28 @@ func TestStateConcurrency(t *testing.T) { } } -// lucky returns random boolean. 
-func lucky() bool { return rand.Intn(2) == 1 } - func TestConcurrentInput(t *testing.T) { - players := NewGameSessionInput() - - events := 1000 var wg sync.WaitGroup + state := NewGameSessionInput() + events := 1000 + wg.Add(2 * events) - wg.Add(events * 2) - - go func() { - for i := 0; i < events; i++ { - player := rand.Intn(maxPort) - go func() { - players.setInput(player, []byte{0, 1}) - wg.Done() - }() - } - }() - - go func() { - for i := 0; i < events; i++ { - player := rand.Intn(maxPort) - go func() { - players.isKeyPressed(uint(player), 100) - wg.Done() - }() - } - }() - + for i := 0; i < events; i++ { + player := rand.Intn(maxPort) + go func() { state.setInput(player, []byte{0, 1}); wg.Done() }() + go func() { state.isKeyPressed(uint(player), 100); wg.Done() }() + } wg.Wait() } + +// expand joins a list of file path elements. +func expand(p ...string) string { + ph, _ := filepath.Abs(filepath.FromSlash(filepath.Join(p...))) + return ph +} + +// hash returns MD5 hash. +func hash(bytes []byte) string { return fmt.Sprintf("%x", md5.Sum(bytes)) } + +// lucky returns random boolean. 
+func lucky() bool { return rand.Intn(2) == 1 } diff --git a/pkg/worker/caged/libretro/image/canvas.c b/pkg/worker/caged/libretro/image/canvas.c deleted file mode 100644 index 037530146..000000000 --- a/pkg/worker/caged/libretro/image/canvas.c +++ /dev/null @@ -1,88 +0,0 @@ -#include "canvas.h" - -__inline xy rotate(int t, int x, int y, int w, int h) { - xy p = {x, y}; - switch (t) { - // 90° CCW or 270° CW - case A90: - p.x = y; - p.y = w - 1 - x; - break; - // 180° CCW - case A180: - p.x = w - 1 - x; - p.y = h - 1 - y; - break; - // 270° CCW or 90° CW - case A270: - p.x = h - 1 - y; - p.y = x; - break; - // flip Y - case F180: - //p.x = x; - p.y = h - 1 - y; - break; - } - return p; -} - -__inline uint32_t _565(uint32_t x) { - return ((x >> 8 & 0xf8) | ((x >> 3 & 0xfc) << 8) | ((x << 3 & 0xfc) << 16)); // | 0xff000000 -} - -__inline uint32_t _8888rev(uint32_t px) { - return (((px >> 16) & 0xff) | (px & 0xff00) | ((px << 16) & 0xff0000)); // | 0xff000000 -} - -void RGBA(int pix, uint32_t *__restrict dst, const void *__restrict source, int y, int h, int w, int hh, int dw, int pad, int rot) { - int x; - xy rxy; - const uint16_t *src16; - const uint32_t *src32; - - switch (pix) { - //case BIT_SHORT5551: - // break; - case BIT_INT_8888REV: - src32 = (const uint32_t *)source; - int pad32 = pad >> 2; - if (rot == NO_ROT) { - for (; y < h; ++y) { - for (x = 0; x < w; ++x) { - *dst++ = _8888rev(*src32++); - } - src32 += pad32; - } - } else { - for (; y < h; ++y) { - for (x = 0; x < w; ++x) { - rxy = rotate(rot, x, y, w, hh); - dst[rxy.x+rxy.y*dw] = _8888rev(*src32++); - } - src32 += pad32; - } - } - break; - case BIT_SHORT565: - src16 = (const uint16_t *)source; - int pad16 = pad >> 1; - if (rot == NO_ROT) { - for (; y < h; ++y) { - for (x = 0; x < w; ++x) { - *dst++ = _565(*src16++); - } - src16 += pad16; - } - } else { - for (; y < h; ++y) { - for (x = 0; x < w; ++x) { - rxy = rotate(rot, x, y, w, hh); - dst[rxy.x+rxy.y*dw] = _565(*src16++); - } - src16 += pad16; - } - 
} - break; - } -} diff --git a/pkg/worker/caged/libretro/image/canvas.go b/pkg/worker/caged/libretro/image/canvas.go deleted file mode 100644 index 31d9750b8..000000000 --- a/pkg/worker/caged/libretro/image/canvas.go +++ /dev/null @@ -1,159 +0,0 @@ -package image - -import ( - "image" - "sync" - "unsafe" - - "golang.org/x/image/draw" -) - -/* -#cgo CFLAGS: -Wall -#include "canvas.h" -*/ -import "C" - -// Canvas is a stateful drawing surface, i.e. image.RGBA -type Canvas struct { - enabled bool - w, h int - vertical bool - pool sync.Pool - wg sync.WaitGroup -} - -type Frame struct{ image.RGBA } - -func (f *Frame) Unwrap() image.RGBA { return f.RGBA } -func (f *Frame) Opaque() bool { return true } -func (f *Frame) Copy() Frame { - return Frame{image.RGBA{Pix: append([]uint8{}, f.Pix...), Stride: f.Stride, Rect: f.Rect}} -} - -const ( - BitFormatShort5551 = iota // BIT_FORMAT_SHORT_5_5_5_1 has 5 bits R, 5 bits G, 5 bits B, 1 bit alpha - BitFormatInt8888Rev // BIT_FORMAT_INT_8_8_8_8_REV has 8 bits R, 8 bits G, 8 bits B, 8 bit alpha - BitFormatShort565 // BIT_FORMAT_SHORT_5_6_5 has 5 bits R, 6 bits G, 5 bits -) - -const ( - ScaleNot = iota // skips image interpolation - ScaleNearestNeighbour // nearest neighbour interpolation - ScaleBilinear // bilinear interpolation -) - -func Resize(scaleType int, src *image.RGBA, out *image.RGBA) { - // !to do set it once instead switching on each iteration - switch scaleType { - case ScaleBilinear: - draw.ApproxBiLinear.Scale(out, out.Bounds(), src, src.Bounds(), draw.Src, nil) - case ScaleNot: - fallthrough - case ScaleNearestNeighbour: - fallthrough - default: - draw.NearestNeighbor.Scale(out, out.Bounds(), src, src.Bounds(), draw.Src, nil) - } -} - -type Rotation uint - -const ( - A90 Rotation = iota + 1 - A180 - A270 - F180 // F180 is flipped Y -) - -func NewCanvas(w, h, size int) *Canvas { - return &Canvas{ - enabled: true, - w: w, - h: h, - vertical: h > w, // input is inverted - pool: sync.Pool{New: func() any { - i := 
Frame{image.RGBA{ - Pix: make([]uint8, size<<2), - Rect: image.Rectangle{Max: image.Point{X: w, Y: h}}, - }} - return &i - }}, - } -} - -func (c *Canvas) Get(w, h int) *Frame { - i := c.pool.Get().(*Frame) - if c.vertical { - w, h = h, w - } - i.Stride = w << 2 - i.Pix = i.Pix[:i.Stride*h] - i.Rect.Max.X = w - i.Rect.Max.Y = h - return i -} - -func (c *Canvas) Put(i *Frame) { - if c.enabled { - c.pool.Put(i) - } -} -func (c *Canvas) Clear() { c.wg = sync.WaitGroup{} } -func (c *Canvas) SetEnabled(enabled bool) { c.enabled = enabled } - -func (c *Canvas) Draw(encoding uint32, rot Rotation, w, h, packedW, bpp int, data []byte, th int) *Frame { - dst := c.Get(w, h) - if th == 0 { - frame(encoding, dst, data, 0, h, w, h, packedW, bpp, rot) - } else { - hn := h / th - c.wg.Add(th) - for i := 0; i < th; i++ { - xx := hn * i - go func() { - frame(encoding, dst, data, xx, hn, w, h, packedW, bpp, rot) - c.wg.Done() - }() - } - c.wg.Wait() - } - - // rescale - if dst.Rect.Dx() != c.w || dst.Rect.Dy() != c.h { - ww := c.w - hh := c.h - // w, h supposedly have been swapped before - if c.vertical { - ww, hh = c.h, c.w - } - out := c.Get(ww, hh) - Resize(ScaleNearestNeighbour, &dst.RGBA, &out.RGBA) - c.Put(dst) - return out - } - - return dst -} - -func frame(encoding uint32, dst *Frame, data []byte, yy int, hn int, w int, h int, pwb int, bpp int, rot Rotation) { - sPtr := unsafe.Pointer(&data[yy*pwb]) - // some cores can zero-right-pad rows to the packed width value - pad := pwb - w*bpp - if pad < 0 { - pad = 0 - } - ds := 0 - if rot == 0 { - ds = yy * dst.Stride - } - dPtr := (*C.uint32_t)(unsafe.Pointer(&dst.Pix[ds])) - C.RGBA(C.int(encoding), dPtr, sPtr, C.int(yy), C.int(yy+hn), C.int(w), C.int(h), C.int(dst.Stride>>2), C.int(pad), C.int(rot)) -} - -func _8888rev(px uint32) uint32 { return uint32(C._8888rev(C.uint32_t(px))) } - -func rotate(t int, x int, y int, w int, h int) (int, int) { - var rot C.xy = C.rotate(C.int(t), C.int(x), C.int(y), C.int(w), C.int(h)) - return 
int(rot.x), int(rot.y) -} diff --git a/pkg/worker/caged/libretro/image/canvas.h b/pkg/worker/caged/libretro/image/canvas.h deleted file mode 100644 index 5ee04a86b..000000000 --- a/pkg/worker/caged/libretro/image/canvas.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef CANVAS_H__ -#define CANVAS_H__ - -#include - -#define BIT_SHORT5551 0 -#define BIT_INT_8888REV 1 -#define BIT_SHORT565 2 - -#define NO_ROT 0 -#define A90 1 -#define A180 2 -#define A270 3 -#define F180 4 - -typedef struct XY { - int x, y; -} xy; - -xy rotate(int t, int x, int y, int w, int h); - -void RGBA(int pix, uint32_t *dst, const void *source, int y, int h, int w, int hh, int dw, int pad, int rot); - -uint32_t _565(uint32_t x); -uint32_t _8888rev(uint32_t px); - -#endif diff --git a/pkg/worker/caged/libretro/image/canvas_test.go b/pkg/worker/caged/libretro/image/canvas_test.go deleted file mode 100644 index b1def658f..000000000 --- a/pkg/worker/caged/libretro/image/canvas_test.go +++ /dev/null @@ -1,340 +0,0 @@ -package image - -import ( - "bytes" - "fmt" - "testing" -) - -func BenchmarkDraw(b *testing.B) { - w1, h1 := 256, 240 - w2, h2 := 640, 480 - - type args struct { - encoding uint32 - rot Rotation - scaleType int - w int - h int - packedW int - bpp int - data []byte - dw int - dh int - th int - } - tests := []struct { - name string - args args - }{ - { - name: "565_0th", - args: args{ - encoding: BitFormatShort565, scaleType: ScaleNearestNeighbour, - w: w1, h: h1, packedW: w1, bpp: 2, data: make([]uint8, w1*h1*2), dw: w1, dh: h1, th: 0, - }, - }, - { - name: "565_0th_90", - args: args{ - encoding: BitFormatShort565, rot: A90, scaleType: ScaleNearestNeighbour, - w: h1, h: w1, packedW: h1, bpp: 2, data: make([]uint8, w1*h1*2), dw: w1, dh: h1, th: 0, - }, - }, - { - name: "565_0th", - args: args{ - encoding: BitFormatShort565, scaleType: ScaleNearestNeighbour, - w: w2, h: h2, packedW: w1, bpp: 2, data: make([]uint8, w2*h2*2), dw: w2, dh: h2, th: 0, - }, - }, - { - name: "565_4th", - args: args{ - 
encoding: BitFormatShort565, scaleType: ScaleNearestNeighbour, - w: w1, h: h1, packedW: w1, bpp: 2, data: make([]uint8, w1*h1*2), dw: w1, dh: h1, th: 4, - }, - }, - { - name: "565_4th", - args: args{ - encoding: BitFormatShort565, scaleType: ScaleNearestNeighbour, - w: w2, h: h2, packedW: w2, bpp: 2, data: make([]uint8, w2*h2*2), dw: w2, dh: h2, th: 4, - }, - }, - { - name: "8888 - 0th", - args: args{ - encoding: BitFormatInt8888Rev, scaleType: ScaleNearestNeighbour, - w: w1, h: h1, packedW: w1, bpp: 4, data: make([]uint8, w1*h1*4), dw: w1, dh: h1, th: 0, - }, - }, - { - name: "8888 - 4th", - args: args{ - encoding: BitFormatInt8888Rev, scaleType: ScaleNearestNeighbour, - w: w1, h: h1, packedW: w1, bpp: 4, data: make([]uint8, w1*h1*4), dw: w1, dh: h1, th: 4, - }, - }, - } - - for _, bn := range tests { - c := NewCanvas(bn.args.dw, bn.args.dh, bn.args.dw*bn.args.dh) - img := c.Get(bn.args.dw, bn.args.dh) - c.Put(img) - img2 := c.Get(bn.args.dw, bn.args.dh) - c.Put(img2) - b.ResetTimer() - b.Run(fmt.Sprintf("%vx%v_%v", bn.args.w, bn.args.h, bn.name), func(b *testing.B) { - for i := 0; i < b.N; i++ { - p := c.Draw(bn.args.encoding, bn.args.rot, bn.args.w, bn.args.h, bn.args.packedW, bn.args.bpp, bn.args.data, bn.args.th) - c.Put(p) - } - b.ReportAllocs() - }) - } -} - -func Test_ix8888(t *testing.T) { - type args struct { - dst *uint32 - px uint32 - expect uint32 - } - tests := []struct { - name string - args args - }{ - { - name: "", - args: args{ - dst: new(uint32), - px: 0x11223344, - expect: 0x00443322, - }, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - *tt.args.dst = _8888rev(tt.args.px) - if *tt.args.dst != tt.args.expect { - t.Errorf("nope, %x %x", *tt.args.dst, tt.args.expect) - } - }) - } -} - -type dimensions struct { - w int - h int -} - -func TestRotate(t *testing.T) { - tests := []struct { - // packed bytes from a 2D matrix - input []byte - // original matrix's width - w int - // original matrix's height - h int - // 
rotation algorithm - rotateHow []Rotation - expected [][]byte - }{ - { - // a cross - []byte{ - 0, 1, 0, - 1, 1, 1, - 0, 1, 0, - }, - 3, 3, []Rotation{0, A90, A180, A270}, - [][]byte{ - { - 0, 1, 0, - 1, 1, 1, - 0, 1, 0, - }, - { - 0, 1, 0, - 1, 1, 1, - 0, 1, 0, - }, - { - 0, 1, 0, - 1, 1, 1, - 0, 1, 0, - }, - { - 0, 1, 0, - 1, 1, 1, - 0, 1, 0, - }, - }, - }, - { - []byte{ - 1, 2, - 3, 4, - 5, 6, - 7, 8, - }, - 2, 4, []Rotation{0, A90, A180, A270}, - [][]byte{ - { - 1, 2, - 3, 4, - 5, 6, - 7, 8, - }, - { - 2, 4, 6, 8, - 1, 3, 5, 7, - }, - { - 8, 7, - 6, 5, - 4, 3, - 2, 1, - }, - { - 7, 5, 3, 1, - 8, 6, 4, 2, - }, - }, - }, - { - // a square - []byte{ - 1, 0, 0, 0, 0, 0, 0, 0, - 0, 1, 1, 1, 1, 1, 1, 0, - 0, 1, 1, 1, 1, 1, 1, 0, - 0, 1, 0, 0, 0, 0, 1, 0, - 0, 1, 1, 1, 1, 1, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 1, - }, - 8, 6, []Rotation{0, A90, A180, A270}, - [][]byte{ - { - // L // R - 1, 0, 0, 0, 0, 0, 0, 0, - 0, 1, 1, 1, 1, 1, 1, 0, - 0, 1, 1, 1, 1, 1, 1, 0, - 0, 1, 0, 0, 0, 0, 1, 0, - 0, 1, 1, 1, 1, 1, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 1, - }, - { - 0, 0, 0, 0, 0, 1, - 0, 1, 1, 1, 1, 0, - 0, 1, 1, 0, 1, 0, - 0, 1, 1, 0, 1, 0, - 0, 1, 1, 0, 1, 0, - 0, 1, 1, 0, 1, 0, - 0, 1, 1, 1, 1, 0, - 1, 0, 0, 0, 0, 0, - }, - - { - 1, 0, 0, 0, 0, 0, 0, 0, - 0, 1, 1, 1, 1, 1, 1, 0, - 0, 1, 0, 0, 0, 0, 1, 0, - 0, 1, 1, 1, 1, 1, 1, 0, - 0, 1, 1, 1, 1, 1, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 1, - }, - { - 0, 0, 0, 0, 0, 1, - 0, 1, 1, 1, 1, 0, - 0, 1, 0, 1, 1, 0, - 0, 1, 0, 1, 1, 0, - 0, 1, 0, 1, 1, 0, - 0, 1, 0, 1, 1, 0, - 0, 1, 1, 1, 1, 0, - 1, 0, 0, 0, 0, 0, - }, - }, - }, - } - - for _, test := range tests { - for i, rot := range test.rotateHow { - if output := exampleRotate(test.input, test.w, test.h, rot); !bytes.Equal(output, test.expected[i]) { - t.Errorf( - "Test fail for angle %v with %v that should be \n%v but it's \n%v", - rot, test.input, test.expected[i], output) - } - } - } -} - -func TestBoundsAfterRotation(t *testing.T) { - tests := []struct { - dim []dimensions - rotateHow []Rotation - 
}{ - { - // a combinatorics lib would be nice instead - []dimensions{ - // square - {w: 100, h: 100}, - // even w/h - {w: 100, h: 50}, - // even h/w - {w: 50, h: 100}, - // odd even w/h - {w: 77, h: 32}, - // even odd h/w - {w: 32, h: 77}, - // just odd - {w: 13, h: 19}, - }, - []Rotation{0, A90, A180, A270}, - }, - } - - for _, test := range tests { - for _, rot := range test.rotateHow { - for _, dim := range test.dim { - - for y := 0; y < dim.h; y++ { - for x := 0; x < dim.w; x++ { - - xx, yy := rotate(int(rot), x, y, dim.w, dim.h) - - if rot == A90 || rot == A270 { // is even - yy, xx = xx, yy - } - - if xx < 0 || xx > dim.w { - t.Errorf("Rot %v, coordinate x should be in range [0; %v]: %v", rot, dim.w-1, xx) - } - - if yy < 0 || yy > dim.h { - t.Errorf("Rot %v, coordinate y should be in range [0; %v]: %v", rot, dim.h-1, yy) - } - } - } - } - } - } -} - -// exampleRotate is an example of rotation usage. -// -// [1 2 3 4 5 6 7 8 9] -// [7 4 1 8 5 2 9 6 3] -func exampleRotate(data []uint8, w int, h int, rot Rotation) []uint8 { - dest := make([]uint8, len(data)) - for y := 0; y < h; y++ { - for x := 0; x < w; x++ { - nx, ny := rotate(int(rot), x, y, w, h) - stride := w - if rot == A90 || rot == A270 { // is even - stride = h - } - dest[nx+ny*stride] = data[x+y*w] - } - } - return dest -} diff --git a/pkg/worker/caged/libretro/manager/http.go b/pkg/worker/caged/libretro/manager/http.go index 8609314e9..1f2dbc881 100644 --- a/pkg/worker/caged/libretro/manager/http.go +++ b/pkg/worker/caged/libretro/manager/http.go @@ -2,6 +2,7 @@ package manager import ( "os" + "path/filepath" "github.com/giongto35/cloud-game/v3/pkg/config" "github.com/giongto35/cloud-game/v3/pkg/logger" @@ -31,6 +32,15 @@ func NewRemoteHttpManager(conf config.LibretroConfig, log *logger.Logger) Manage } log.Debug().Msgf("Using .lock file: %v", fileLock) + if err := os.MkdirAll(filepath.Dir(fileLock), 0770); err != nil { + log.Error().Err(err).Msgf("couldn't create lock") + } else { + f, err := 
os.Create(fileLock) + if err != nil { + log.Error().Err(err).Msgf("couldn't create lock") + } + _ = f.Close() + } ar, err := arch.Guess() if err != nil { log.Error().Err(err).Msg("couldn't get Libretro core file extension") @@ -73,8 +83,16 @@ func CheckCores(conf config.Emulator, log *logger.Logger) error { func (m *Manager) Sync() error { // IPC lock if multiple worker processes on the same machine - m.fmu.Lock() - defer m.fmu.Unlock() + err := m.fmu.Lock() + if err != nil { + m.log.Error().Err(err).Msg("file lock fail") + } + defer func() { + err := m.fmu.Unlock() + if err != nil { + m.log.Error().Err(err).Msg("file unlock fail") + } + }() installed, err := m.GetInstalled(m.arch.LibExt) if err != nil { diff --git a/pkg/worker/caged/libretro/nanoarch/nanoarch.go b/pkg/worker/caged/libretro/nanoarch/nanoarch.go index 841ae8c45..b601aadea 100644 --- a/pkg/worker/caged/libretro/nanoarch/nanoarch.go +++ b/pkg/worker/caged/libretro/nanoarch/nanoarch.go @@ -12,7 +12,6 @@ import ( "github.com/giongto35/cloud-game/v3/pkg/logger" "github.com/giongto35/cloud-game/v3/pkg/os" "github.com/giongto35/cloud-game/v3/pkg/worker/caged/libretro/graphics" - "github.com/giongto35/cloud-game/v3/pkg/worker/caged/libretro/image" "github.com/giongto35/cloud-game/v3/pkg/worker/caged/libretro/repo/arch" "github.com/giongto35/cloud-game/v3/pkg/worker/thread" ) @@ -33,6 +32,12 @@ const KeyReleased = 0 const MaxPort int = 4 +var ( + RGBA5551 = PixFmt{C: 0, BPP: 2} // BIT_FORMAT_SHORT_5_5_5_1 has 5 bits R, 5 bits G, 5 bits B, 1 bit alpha + RGBA8888Rev = PixFmt{C: 1, BPP: 4} // BIT_FORMAT_INT_8_8_8_8_REV has 8 bits R, 8 bits G, 8 bits B, 8 bit alpha + RGB565 = PixFmt{C: 2, BPP: 2} // BIT_FORMAT_SHORT_5_6_5 has 5 bits R, 6 bits G, 5 bits +) + type Nanoarch struct { Handlers LastFrameTime int64 @@ -44,7 +49,7 @@ type Nanoarch struct { } options *map[string]string reserved chan struct{} // limits concurrent use - Rot image.Rotation + Rot uint serializeSize C.size_t stopped atomic.Bool sysAvInfo 
C.struct_retro_system_av_info @@ -58,9 +63,8 @@ type Nanoarch struct { enabled bool autoCtx bool } - BPP uint hw *C.struct_retro_hw_render_callback - PixFmt uint32 + PixFmt PixFmt } vfr bool sdlCtx *graphics.SDL @@ -78,7 +82,7 @@ type Handlers struct { type FrameInfo struct { W uint H uint - Packed uint + Stride uint } type Metadata struct { @@ -92,6 +96,24 @@ type Metadata struct { Hacks []string } +type PixFmt struct { + C uint32 + BPP uint +} + +func (p PixFmt) String() string { + switch p.C { + case 0: + return "RGBA5551/2" + case 1: + return "RGBA8888Rev/4" + case 2: + return "RGB565/2" + default: + return fmt.Sprintf("Unknown (%v/%v)", p.C, p.BPP) + } +} + // Nan0 is a global link for C callbacks to Go var Nan0 = Nanoarch{ reserved: make(chan struct{}, 1), // this thing forbids concurrent use of the emulator @@ -118,7 +140,7 @@ func NewNano(localPath string) *Nanoarch { func (n *Nanoarch) AudioSampleRate() int { return int(n.sysAvInfo.timing.sample_rate) } func (n *Nanoarch) VideoFramerate() int { return int(n.sysAvInfo.timing.fps) } -func (n *Nanoarch) IsPortrait() bool { return n.Rot == image.A90 || n.Rot == image.A270 } +func (n *Nanoarch) IsPortrait() bool { return n.Rot == 90 || n.Rot == 270 } func (n *Nanoarch) GeometryBase() (int, int) { return int(n.sysAvInfo.geometry.base_width), int(n.sysAvInfo.geometry.base_height) } @@ -252,7 +274,7 @@ func (n *Nanoarch) LoadGame(path string) error { if n.Video.gl.enabled { //setRotation(image.F180) // flip Y coordinates of OpenGL - bufS := uint(n.sysAvInfo.geometry.max_width*n.sysAvInfo.geometry.max_height) * n.Video.BPP + bufS := uint(n.sysAvInfo.geometry.max_width*n.sysAvInfo.geometry.max_height) * n.Video.PixFmt.BPP graphics.SetBuffer(int(bufS)) n.log.Info().Msgf("Set buffer: %v", byteCountBinary(int64(bufS))) if n.LibCo { @@ -357,34 +379,33 @@ func (n *Nanoarch) IsStopped() bool { return n.stopped.Load() } func videoSetPixelFormat(format uint32) (C.bool, error) { switch format { case 
C.RETRO_PIXEL_FORMAT_0RGB1555: - Nan0.Video.PixFmt = image.BitFormatShort5551 + Nan0.Video.PixFmt = RGBA5551 if err := graphics.SetPixelFormat(graphics.UnsignedShort5551); err != nil { return false, fmt.Errorf("unknown pixel format %v", Nan0.Video.PixFmt) } - Nan0.Video.BPP = 2 // format is not implemented return false, fmt.Errorf("unsupported pixel type %v converter", format) case C.RETRO_PIXEL_FORMAT_XRGB8888: - Nan0.Video.PixFmt = image.BitFormatInt8888Rev + Nan0.Video.PixFmt = RGBA8888Rev if err := graphics.SetPixelFormat(graphics.UnsignedInt8888Rev); err != nil { return false, fmt.Errorf("unknown pixel format %v", Nan0.Video.PixFmt) } - Nan0.Video.BPP = 4 case C.RETRO_PIXEL_FORMAT_RGB565: - Nan0.Video.PixFmt = image.BitFormatShort565 + Nan0.Video.PixFmt = RGB565 if err := graphics.SetPixelFormat(graphics.UnsignedShort565); err != nil { return false, fmt.Errorf("unknown pixel format %v", Nan0.Video.PixFmt) } - Nan0.Video.BPP = 2 default: return false, fmt.Errorf("unknown pixel type %v", format) } + Nan0.log.Info().Msgf("Pixel format: %v", Nan0.Video.PixFmt) + return true, nil } -func setRotation(rotation image.Rotation) { - Nan0.Rot = rotation - Nan0.log.Debug().Msgf("Image rotated %v°", map[uint]uint{0: 0, 1: 90, 2: 180, 3: 270}[uint(rotation)]) +func setRotation(rot uint) { + Nan0.Rot = rot + Nan0.log.Debug().Msgf("Image rotated %v°", rot) } func printOpenGLDriverInfo() { @@ -557,7 +578,7 @@ func coreVideoRefresh(data unsafe.Pointer, width, height uint, packed uint) { // calculate real frame width in pixels from packed data (realWidth >= width) // some cores or games output zero pitch, i.e. 
N64 Mupen if packed == 0 { - packed = width * Nan0.Video.BPP + packed = width * Nan0.Video.PixFmt.BPP } // calculate space for the video frame bytes := packed * height @@ -575,7 +596,7 @@ func coreVideoRefresh(data unsafe.Pointer, width, height uint, packed uint) { // also we have an option of xN output frame magnification // so, it may be rescaled - Nan0.Handlers.OnVideo(data_, int32(dt), FrameInfo{W: width, H: height, Packed: packed}) + Nan0.Handlers.OnVideo(data_, int32(dt), FrameInfo{W: width, H: height, Stride: packed}) } //export coreInputPoll @@ -665,8 +686,16 @@ func coreEnvironment(cmd C.unsigned, data unsafe.Pointer) C.bool { } switch cmd { + case C.RETRO_ENVIRONMENT_SET_SYSTEM_AV_INFO: + av := *(*C.struct_retro_system_av_info)(data) + Nan0.log.Info().Msgf(">>> SET SYS AV INFO: %v", av) + return true + case C.RETRO_ENVIRONMENT_SET_GEOMETRY: + geom := *(*C.struct_retro_game_geometry)(data) + Nan0.log.Info().Msgf(">>> GEOMETRY: %v", geom) + return true case C.RETRO_ENVIRONMENT_SET_ROTATION: - setRotation(image.Rotation(*(*uint)(data) % 4)) + setRotation((*(*uint)(data) % 4) * 90) return true case C.RETRO_ENVIRONMENT_GET_CAN_DUPE: *(*C.bool)(data) = C.bool(true) diff --git a/pkg/worker/caged/libretro/recording.go b/pkg/worker/caged/libretro/recording.go index 7c128aeab..cc4cdcdde 100644 --- a/pkg/worker/caged/libretro/recording.go +++ b/pkg/worker/caged/libretro/recording.go @@ -1,7 +1,6 @@ package libretro import ( - "image" "time" "github.com/giongto35/cloud-game/v3/pkg/config" @@ -15,23 +14,29 @@ type RecordingFrontend struct { rec *recorder.Recording } -// !to fix opaque image save - -type opaque struct{ image.RGBA } +func WithRecording(fe Emulator, rec bool, user string, game string, conf config.Recording, log *logger.Logger) *RecordingFrontend { -func (o *opaque) Opaque() bool { return true } + pix := "" + switch fe.PixFormat() { + case 0: + pix = "rgb1555" + case 1: + pix = "brga" + case 2: + pix = "rgb565" + } -func WithRecording(fe Emulator, rec 
bool, user string, game string, conf config.Recording, log *logger.Logger) *RecordingFrontend { rr := &RecordingFrontend{Emulator: fe, rec: recorder.NewRecording( recorder.Meta{UserName: user}, log, recorder.Options{ - Dir: conf.Folder, - Game: game, - ImageCompressionLevel: conf.CompressLevel, - Name: conf.Name, - Zip: conf.Zip, - Vsync: true, + Dir: conf.Folder, + Game: game, + Name: conf.Name, + Zip: conf.Zip, + Vsync: true, + Flip: fe.Flipped(), + Pix: pix, })} rr.ToggleRecording(rec, user) return rr @@ -52,7 +57,7 @@ func (r *RecordingFrontend) SetAudioCb(fn func(app.Audio)) { func (r *RecordingFrontend) SetVideoCb(fn func(app.Video)) { r.Emulator.SetVideoCb(func(v app.Video) { if r.IsRecording() { - r.rec.WriteVideo(recorder.Video{Image: &opaque{v.Frame}, Duration: time.Duration(v.Duration)}) + r.rec.WriteVideo(recorder.Video{Frame: recorder.Frame(v.Frame), Duration: time.Duration(v.Duration)}) } fn(v) }) diff --git a/pkg/worker/coordinatorhandlers.go b/pkg/worker/coordinatorhandlers.go index ebce60622..2a791c1b6 100644 --- a/pkg/worker/coordinatorhandlers.go +++ b/pkg/worker/coordinatorhandlers.go @@ -121,6 +121,7 @@ func (c *coordinator) HandleGameStart(rq api.StartGameRequest[com.Uid], w *Worke m.AudioSrcHz = app.AudioSampleRate() m.AudioFrame = w.conf.Encoder.Audio.Frame m.VideoW, m.VideoH = app.ViewportSize() + m.VideoScale = app.Scale() r.SetMedia(m) diff --git a/pkg/worker/media/media.go b/pkg/worker/media/media.go index 5f53324a3..5b33404e6 100644 --- a/pkg/worker/media/media.go +++ b/pkg/worker/media/media.go @@ -103,11 +103,11 @@ func (s samples) stretch(size int) []int16 { } type WebrtcMediaPipe struct { + a *opus.Encoder + v *encoder.Video onAudio func([]byte) - opus *opus.Encoder audioBuf buffer log *logger.Logger - enc *encoder.VideoEncoder aConf config.Audio vConf config.Video @@ -115,6 +115,7 @@ type WebrtcMediaPipe struct { AudioSrcHz int AudioFrame int VideoW, VideoH int + VideoScale float64 } func NewWebRtcMediaPipe(ac config.Audio, vc 
config.Video, log *logger.Logger) *WebrtcMediaPipe { @@ -126,8 +127,8 @@ func (wmp *WebrtcMediaPipe) SetAudioCb(cb func([]byte, int32)) { wmp.onAudio = func(bytes []byte) { cb(bytes, fr) } } func (wmp *WebrtcMediaPipe) Destroy() { - if wmp.enc != nil { - wmp.enc.Stop() + if wmp.v != nil { + wmp.v.Stop() } } func (wmp *WebrtcMediaPipe) PushAudio(audio []int16) { wmp.audioBuf.write(audio, wmp.encodeAudio) } @@ -136,7 +137,7 @@ func (wmp *WebrtcMediaPipe) Init() error { if err := wmp.initAudio(wmp.AudioSrcHz, wmp.AudioFrame); err != nil { return err } - if err := wmp.initVideo(wmp.VideoW, wmp.VideoH, wmp.vConf); err != nil { + if err := wmp.initVideo(wmp.VideoW, wmp.VideoH, wmp.VideoScale, wmp.vConf); err != nil { return err } return nil @@ -148,7 +149,7 @@ func (wmp *WebrtcMediaPipe) initAudio(srcHz int, frameSize int) error { return fmt.Errorf("opus fail: %w", err) } wmp.log.Debug().Msgf("Opus: %v", au.GetInfo()) - wmp.opus = au + wmp.a = au buf := newBuffer(frame(srcHz, frameSize)) dstHz, _ := au.SampleRate() if srcHz != dstHz { @@ -160,7 +161,7 @@ func (wmp *WebrtcMediaPipe) initAudio(srcHz int, frameSize int) error { } func (wmp *WebrtcMediaPipe) encodeAudio(pcm samples) { - data, err := wmp.opus.Encode(pcm) + data, err := wmp.a.Encode(pcm) audioPool.Put((*[]int16)(&pcm)) if err != nil { wmp.log.Error().Err(err).Msgf("opus encode fail") @@ -169,25 +170,36 @@ func (wmp *WebrtcMediaPipe) encodeAudio(pcm samples) { wmp.onAudio(data) } -func (wmp *WebrtcMediaPipe) initVideo(w, h int, conf config.Video) error { +func (wmp *WebrtcMediaPipe) initVideo(w, h int, scale float64, conf config.Video) error { var enc encoder.Encoder var err error + + sw, sh := round(w, scale), round(h, scale) + + wmp.log.Debug().Msgf("Scale: %vx%v -> %vx%v", w, h, sw, sh) + wmp.log.Info().Msgf("Video codec: %v", conf.Codec) if conf.Codec == string(encoder.H264) { wmp.log.Debug().Msgf("x264: build v%v", h264.LibVersion()) opts := h264.Options(conf.H264) - enc, err = h264.NewEncoder(w, h, &opts) 
+ enc, err = h264.NewEncoder(sw, sh, &opts) } else { opts := vpx.Options(conf.Vpx) - enc, err = vpx.NewEncoder(w, h, &opts) + enc, err = vpx.NewEncoder(sw, sh, &opts) } if err != nil { return fmt.Errorf("couldn't create a video encoder: %w", err) } - wmp.enc = encoder.NewVideoEncoder(enc, w, h, conf.Concurrency, wmp.log) + wmp.v = encoder.NewVideoEncoder(enc, w, h, scale, wmp.log) + wmp.log.Debug().Msgf("%v", wmp.v.Info()) return nil } -func (wmp *WebrtcMediaPipe) ProcessVideo(v app.Video) []byte { return wmp.enc.Encode(&v.Frame) } +func round(x int, scale float64) int { return (int(float64(x)*scale) + 1) & ^1 } -func (wmp *WebrtcMediaPipe) SetVideoFlip(b bool) { wmp.enc.SetFlip(b) } +func (wmp *WebrtcMediaPipe) ProcessVideo(v app.Video) []byte { + return wmp.v.Encode(encoder.InFrame(v.Frame)) +} +func (wmp *WebrtcMediaPipe) SetPixFmt(f uint32) { wmp.v.SetPixFormat(f) } +func (wmp *WebrtcMediaPipe) SetVideoFlip(b bool) { wmp.v.SetFlip(b) } +func (wmp *WebrtcMediaPipe) SetRot(r uint) { wmp.v.SetRot(r) } diff --git a/pkg/worker/media/media_test.go b/pkg/worker/media/media_test.go index 612be7a2a..e99522efa 100644 --- a/pkg/worker/media/media_test.go +++ b/pkg/worker/media/media_test.go @@ -46,7 +46,7 @@ func run(w, h int, cod encoder.VideoCodec, count int, a *image.RGBA, b *image.RG } logger.SetGlobalLevel(logger.Disabled) - ve := encoder.NewVideoEncoder(enc, w, h, 8, l) + ve := encoder.NewVideoEncoder(enc, w, h, 1, l) defer ve.Stop() if a == nil { @@ -61,7 +61,12 @@ func run(w, h int, cod encoder.VideoCodec, count int, a *image.RGBA, b *image.RG if i%2 == 0 { im = b } - out := ve.Encode(im) + out := ve.Encode(encoder.InFrame{ + Data: im.Pix, + Stride: im.Stride, + W: im.Bounds().Dx(), + H: im.Bounds().Dy(), + }) if out == nil { backend.Fatalf("encoder closed abnormally") } diff --git a/pkg/worker/recorder/ffmpegmux.go b/pkg/worker/recorder/ffmpegmux.go index 4869ef712..37c9df6aa 100644 --- a/pkg/worker/recorder/ffmpegmux.go +++ b/pkg/worker/recorder/ffmpegmux.go @@ 
-15,6 +15,8 @@ const demuxFile = "input.txt" // ffmpeg concat demuxer, see: https://ffmpeg.org/ffmpeg-formats.html#concat // example: // +// !to change +// // ffmpeg -f concat -i input.txt \ // -ac 2 -channel_layout stereo -i audio.wav \ // -b:a 192K -crf 23 -vf fps=30 -pix_fmt yuv420p \ @@ -25,9 +27,17 @@ func createFfmpegMuxFile(dir string, fPattern string, frameTimes []time.Duration return err } defer func() { er = demux.Close() }() - _, err = demux.WriteString( - fmt.Sprintf("ffconcat version 1.0\n# v: 1\n# date: %v\n# game: %v\n# fps: %v\n# freq (hz): %v\n\n", - time.Now().Format("20060102"), opts.Game, opts.Fps, opts.Frequency)) + + b := strings.Builder{} + + b.WriteString("ffconcat version 1.0\n") + b.WriteString(meta("v", "1")) + b.WriteString(meta("date", time.Now().Format("20060102"))) + b.WriteString(meta("game", opts.Game)) + b.WriteString(meta("fps", opts.Fps)) + b.WriteString(meta("freq", opts.Frequency)) + b.WriteString(meta("pix", opts.Pix)) + _, err = demux.WriteString(fmt.Sprintf("%s\n", b.String())) if err != nil { return err } @@ -51,7 +61,9 @@ func createFfmpegMuxFile(dir string, fPattern string, frameTimes []time.Duration } i++ } - inf := fmt.Sprintf("file %v\nduration %f\n", name, dur) + w, h, s := ExtractFileInfo(file.Name()) + inf := fmt.Sprintf("file %v\nduration %f\n%s%s%s", name, dur, + metaf("width", w), metaf("height", h), metaf("stride", s)) if _, err := demux.WriteString(inf); err != nil { er = err } @@ -61,3 +73,11 @@ func createFfmpegMuxFile(dir string, fPattern string, frameTimes []time.Duration } return er } + +// meta adds stream_meta key value line. +func meta(key string, value any) string { return fmt.Sprintf("stream_meta %s '%v'\n", key, value) } + +// metaf adds file_packet_meta key value line. 
+func metaf(key string, value any) string { + return fmt.Sprintf("file_packet_meta %s '%v'\n", key, value) +} diff --git a/pkg/worker/recorder/options.go b/pkg/worker/recorder/options.go index 9707e1711..fe4ca7ce4 100644 --- a/pkg/worker/recorder/options.go +++ b/pkg/worker/recorder/options.go @@ -1,14 +1,18 @@ package recorder type Options struct { - Dir string - Fps float64 - Frequency int - Game string - ImageCompressionLevel int - Name string - Zip bool - Vsync bool + Dir string + Fps float64 + W int + H int + Stride int + Flip bool + Frequency int + Pix string + Game string + Name string + Zip bool + Vsync bool } type Meta struct { diff --git a/pkg/worker/recorder/pngstream.go b/pkg/worker/recorder/pngstream.go deleted file mode 100644 index 1cb0cf884..000000000 --- a/pkg/worker/recorder/pngstream.go +++ /dev/null @@ -1,72 +0,0 @@ -package recorder - -import ( - "bytes" - "fmt" - "image" - "image/png" - "log" - "os" - "path/filepath" - "sync" - "sync/atomic" -) - -type pngStream struct { - dir string - e *png.Encoder - id uint32 - wg sync.WaitGroup -} - -const videoFile = "f%07d.png" - -type pool struct{ sync.Pool } - -func pngBuf() *pool { return &pool{sync.Pool{New: func() any { return &png.EncoderBuffer{} }}} } -func (p *pool) Get() *png.EncoderBuffer { return p.Pool.Get().(*png.EncoderBuffer) } -func (p *pool) Put(b *png.EncoderBuffer) { p.Pool.Put(b) } - -func newPngStream(dir string, opts Options) (*pngStream, error) { - return &pngStream{ - dir: dir, - e: &png.Encoder{ - CompressionLevel: png.CompressionLevel(opts.ImageCompressionLevel), - BufferPool: pngBuf(), - }, - }, nil -} - -func (p *pngStream) Close() error { - atomic.StoreUint32(&p.id, 0) - p.wg.Wait() - return nil -} - -func (p *pngStream) Write(data Video) { - fileName := fmt.Sprintf(videoFile, atomic.AddUint32(&p.id, 1)) - p.wg.Add(1) - go p.saveImage(fileName, data.Image) -} - -func (p *pngStream) saveImage(fileName string, img image.Image) { - var buf bytes.Buffer - x, y := 
(img).Bounds().Dx(), (img).Bounds().Dy() - buf.Grow(x * y * 4) - - if err := p.e.Encode(&buf, img); err != nil { - log.Printf("p err: %v", err) - } else { - file, err := os.Create(filepath.Join(p.dir, fileName)) - if err != nil { - log.Printf("c err: %v", err) - } - if _, err = file.Write(buf.Bytes()); err != nil { - log.Printf("f err: %v", err) - } - if err = file.Close(); err != nil { - log.Printf("fc err: %v", err) - } - } - p.wg.Done() -} diff --git a/pkg/worker/recorder/rawstream.go b/pkg/worker/recorder/rawstream.go new file mode 100644 index 000000000..26b8875c3 --- /dev/null +++ b/pkg/worker/recorder/rawstream.go @@ -0,0 +1,73 @@ +package recorder + +import ( + "fmt" + "log" + "os" + "path/filepath" + "strings" + "sync" + "sync/atomic" +) + +type rawStream struct { + dir string + id uint32 + wg sync.WaitGroup +} + +const videoFile = "f%07d__%dx%d__%d.raw" + +func newRawStream(dir string) (*rawStream, error) { + return &rawStream{dir: dir}, nil +} + +func (p *rawStream) Close() error { + atomic.StoreUint32(&p.id, 0) + p.wg.Wait() + return nil +} + +func (p *rawStream) Write(data Video) { + i := atomic.AddUint32(&p.id, 1) + fileName := fmt.Sprintf(videoFile, i, data.Frame.W, data.Frame.H, data.Frame.Stride) + p.wg.Add(1) + go p.saveFrame(fileName, data.Frame) +} + +// saveFrame writes the raw frame bytes into a new file inside the stream dir +// and marks the pending write as done on every exit path. +func (p *rawStream) saveFrame(fileName string, frame Frame) { + defer p.wg.Done() + file, err := os.Create(filepath.Join(p.dir, fileName)) + if err != nil { + log.Printf("c err: %v", err) + // no file to write into or to close + return + } + if _, err = file.Write(frame.Data); err != nil { + log.Printf("f err: %v", err) + } + if err = file.Close(); err != nil { + log.Printf("fc err: %v", err) + } +} + +// ExtractFileInfo parses the width, height and stride parts out of a raw frame +// file name built with the videoFile pattern (f{id}__{w}x{h}__{stride}.raw). +// Parts that are missing from the name are returned as empty strings. +func ExtractFileInfo(name string) (w, h, st string) { + s1 := strings.Split(name, "__") + if len(s1) > 1 { + s12 := strings.Split(s1[1], "x") + if len(s12) > 1 { + w, h = s12[0], s12[1] + } + } + // the stride part may be absent: indexing s1[2] unconditionally + // panics on names with a single "__" separator + if len(s1) > 2 { + st = strings.TrimSuffix(s1[2], filepath.Ext(s1[2])) + } + return +} diff --git a/pkg/worker/recorder/recorder.go
b/pkg/worker/recorder/recorder.go index 24edc4c8d..4c3d12078 100644 --- a/pkg/worker/recorder/recorder.go +++ b/pkg/worker/recorder/recorder.go @@ -1,7 +1,6 @@ package recorder import ( - "image" "io" "math/rand" "os" @@ -60,9 +59,14 @@ type ( Duration time.Duration } Video struct { - Image image.Image + Frame Frame Duration time.Duration } + Frame struct { + Data []byte + Stride int + W, H int + } ) // NewRecording creates new media recorder for the emulator. @@ -96,7 +100,7 @@ func (r *Recording) Start() { r.log.Fatal().Err(err) } r.audio = audio - video, err := newPngStream(path, r.opts) + video, err := newRawStream(path) if err != nil { r.log.Fatal().Err(err) } diff --git a/pkg/worker/recorder/recorder_test.go b/pkg/worker/recorder/recorder_test.go index e44417491..8d2fa998f 100644 --- a/pkg/worker/recorder/recorder_test.go +++ b/pkg/worker/recorder/recorder_test.go @@ -30,13 +30,12 @@ func TestName(t *testing.T) { Meta{UserName: "test"}, logger.Default(), Options{ - Dir: dir, - Fps: 60, - Frequency: 10, - Game: fmt.Sprintf("test_game_%v", rand.Int()), - ImageCompressionLevel: 0, - Name: "test", - Zip: false, + Dir: dir, + Fps: 60, + Frequency: 10, + Game: fmt.Sprintf("test_game_%v", rand.Int()), + Name: "test", + Zip: false, }) recorder.Set(true, "test_user") @@ -45,11 +44,11 @@ func TestName(t *testing.T) { var imgWg, audioWg sync.WaitGroup imgWg.Add(iterations) audioWg.Add(iterations) - img := generateImage(100, 100) + frame := genFrame(100, 100) for i := 0; i < 222; i++ { go func() { - recorder.WriteVideo(Video{Image: img, Duration: 16 * time.Millisecond}) + recorder.WriteVideo(Video{Frame: frame, Duration: 16 * time.Millisecond}) imgWg.Done() }() go func() { @@ -66,17 +65,14 @@ func TestName(t *testing.T) { } func BenchmarkNewRecording100x100(b *testing.B) { - benchmarkRecorder(100, 100, 0, b) + benchmarkRecorder(100, 100, b) } -func BenchmarkNewRecording320x240_compressed(b *testing.B) { - benchmarkRecorder(320, 240, 0, b) -} -func 
BenchmarkNewRecording320x240_nocompress(b *testing.B) { - benchmarkRecorder(320, 240, -1, b) +func BenchmarkNewRecording320x240(b *testing.B) { + benchmarkRecorder(320, 240, b) } -func benchmarkRecorder(w, h int, comp int, b *testing.B) { +func benchmarkRecorder(w, h int, b *testing.B) { b.StopTimer() dir, err := os.MkdirTemp("", "rec_bench_") @@ -89,8 +85,8 @@ func benchmarkRecorder(w, h int, comp int, b *testing.B) { } }() - image1 := generateImage(w, h) - image2 := generateImage(w, h) + frame1 := genFrame(w, h) + frame2 := genFrame(w, h) var bytes int64 = 0 @@ -103,25 +99,24 @@ func benchmarkRecorder(w, h int, comp int, b *testing.B) { Meta{UserName: "test"}, logger.Default(), Options{ - Dir: dir, - Fps: 60, - Frequency: 10, - Game: fmt.Sprintf("test_game_%v", rand.Int()), - ImageCompressionLevel: comp, - Name: "", - Zip: false, + Dir: dir, + Fps: 60, + Frequency: 10, + Game: fmt.Sprintf("test_game_%v", rand.Int()), + Name: "", + Zip: false, }) recorder.Set(true, "test_user") samples := []int16{0, 0, 0, 0, 0, 1, 11, 11, 11, 1} for i := 0; i < b.N; i++ { - im := image1 + f := frame1 if i%2 == 0 { - im = image2 + f = frame2 } go func() { - recorder.WriteVideo(Video{Image: im, Duration: 16 * time.Millisecond}) - atomic.AddInt64(&bytes, int64(len(im.(*image.RGBA).Pix))) + recorder.WriteVideo(Video{Frame: f, Duration: 16 * time.Millisecond}) + atomic.AddInt64(&bytes, int64(len(f.Data))) ticks.Done() }() go func() { @@ -137,14 +132,19 @@ func benchmarkRecorder(w, h int, comp int, b *testing.B) { } } -func generateImage(w, h int) image.Image { +func genFrame(w, h int) Frame { img := image.NewRGBA(image.Rect(0, 0, w, h)) for x := 0; x < w; x++ { for y := 0; y < h; y++ { img.Set(x, y, randomColor()) } } - return img + return Frame{ + Data: img.Pix, + Stride: img.Stride, + W: img.Bounds().Dx(), + H: img.Bounds().Dy(), + } } var rnd = rand.New(rand.NewSource(time.Now().Unix())) diff --git a/pkg/worker/room/room_test.go b/pkg/worker/room/room_test.go index 
36bd25d44..ed67e48cb 100644 --- a/pkg/worker/room/room_test.go +++ b/pkg/worker/room/room_test.go @@ -19,16 +19,20 @@ import ( "github.com/giongto35/cloud-game/v3/pkg/com" "github.com/giongto35/cloud-game/v3/pkg/config" "github.com/giongto35/cloud-game/v3/pkg/encoder" + "github.com/giongto35/cloud-game/v3/pkg/encoder/color/bgra" + "github.com/giongto35/cloud-game/v3/pkg/encoder/color/rgb565" + "github.com/giongto35/cloud-game/v3/pkg/encoder/color/rgba" "github.com/giongto35/cloud-game/v3/pkg/games" "github.com/giongto35/cloud-game/v3/pkg/logger" "github.com/giongto35/cloud-game/v3/pkg/worker/caged" "github.com/giongto35/cloud-game/v3/pkg/worker/caged/app" - canvas "github.com/giongto35/cloud-game/v3/pkg/worker/caged/libretro/image" "github.com/giongto35/cloud-game/v3/pkg/worker/media" "github.com/giongto35/cloud-game/v3/pkg/worker/thread" "golang.org/x/image/font" "golang.org/x/image/font/basicfont" "golang.org/x/image/math/fixed" + + _ "github.com/giongto35/cloud-game/v3/test" ) var ( @@ -58,13 +62,15 @@ func (r testRoom) Close() { time.Sleep(2 * time.Second) // hack: wait room destruction (atm impossible to tell) } -func (r testRoom) WaitFrames(n int) canvas.Frame { - var frame canvas.Frame +func (r testRoom) WaitFrame(n int) app.RawFrame { var wg sync.WaitGroup - wg.Add(n) + wg.Add(1) + target := app.RawFrame{} WithEmulator(r.app).SetVideoCb(func(v app.Video) { - if n > 0 { - frame = (&canvas.Frame{RGBA: v.Frame}).Copy() + if n == 1 { + target = v.Frame + target.Data = make([]byte, len(v.Frame.Data)) + copy(target.Data, v.Frame.Data) wg.Done() } n-- @@ -73,7 +79,7 @@ func (r testRoom) WaitFrames(n int) canvas.Frame { r.StartApp() } wg.Wait() - return frame + return target } type testParams struct { @@ -81,11 +87,11 @@ type testParams struct { game games.GameMetadata codecs []codec frames int + color int } // Store absolute path to test games var testTempDir = filepath.Join(os.TempDir(), "cloud-game-core-tests") -var root = "" // games var ( @@ -94,12 +100,6 @@ 
var ( fd = games.GameMetadata{Name: "Florian Demo", Type: "n64", Path: "Sample Demo by Florian (PD).z64", System: "n64"} ) -func init() { - runtime.LockOSThread() - p, _ := filepath.Abs("../../../") - root = p + string(filepath.Separator) -} - func TestMain(m *testing.M) { flag.BoolVar(&renderFrames, "renderFrames", false, "Render frames for eye testing purposes") flag.StringVar(&outputPath, "outputPath", "./", "Output path for generated files") @@ -115,40 +115,51 @@ func TestRoom(t *testing.T) { for _, test := range tests { room := room(conf{codec: test.codecs[0], game: test.game}) - room.WaitFrames(test.frames) + room.WaitFrame(test.frames) room.Close() } } func TestAll(t *testing.T) { tests := []testParams{ - {game: sushi, frames: 150}, - {game: alwas, frames: 50}, - {game: fd, frames: 50, system: "main-thread"}, + {game: sushi, frames: 150, color: 2}, + {game: alwas, frames: 50, color: 1}, + {game: fd, frames: 50, system: "gl", color: 1}, } crc32q := crc32.MakeTable(0xD5828281) for _, test := range tests { + var frame app.RawFrame room := room(conf{game: test.game, codec: encoder.VP8, autoGlContext: autoGlContext, autoAppStart: false}) - var frame canvas.Frame - if test.system == "main-thread" { - thread.Main(func() { - frame = room.WaitFrames(test.frames) - room.Close() - }) - } else { - frame = room.WaitFrames(test.frames) - room.Close() - } + flip := test.system == "gl" + thread.Main(func() { frame = room.WaitFrame(test.frames) }) + room.Close() + if renderFrames { - tag := fmt.Sprintf("%v-%v-0x%08x", runtime.GOOS, test.game.Type, crc32.Checksum(frame.Pix, crc32q)) - dumpCanvas(&frame, tag, fmt.Sprintf("%v [%v]", tag, test.frames), outputPath) + rect := image.Rect(0, 0, frame.W, frame.H) + var src image.Image + if test.color == 1 { + src1 := bgra.NewBGRA(rect) + src1.Pix = frame.Data + src1.Stride = frame.Stride + src = src1 + } else { + if test.color == 2 { + src1 := rgb565.NewRGB565(rect) + src1.Pix = frame.Data + src1.Stride = frame.Stride + src = src1 + 
} + } + dst := rgba.ToRGBA(src, flip) + tag := fmt.Sprintf("%v-%v-0x%08x", runtime.GOOS, test.game.Type, crc32.Checksum(frame.Data, crc32q)) + dumpCanvas(dst, tag, fmt.Sprintf("%v [%v]", tag, test.frames), outputPath) } } } -func dumpCanvas(frame *canvas.Frame, name string, caption string, path string) { +func dumpCanvas(frame *image.RGBA, name string, caption string, path string) { // slap 'em caption if caption != "" { draw.Draw(frame, image.Rect(8, 8, 8+len(caption)*7+3, 24), &image.Uniform{C: color.RGBA{}}, image.Point{}, draw.Src) @@ -187,16 +198,17 @@ func room(cfg conf) testRoom { panic(err) } - conf.Worker.Library.BasePath = filepath.FromSlash(root + "/assets/games") + conf.Emulator.Libretro.Cores.Repo.ExtLock = expand("tests", ".cr", "cloud-game.lock") + conf.Emulator.LocalPath = expand("tests", conf.Emulator.LocalPath) + conf.Emulator.Storage = expand("tests", "storage") + + conf.Encoder.Video.Codec = string(cfg.codec) - fixEmulators(&conf, cfg.autoGlContext) l := logger.NewConsole(conf.Worker.Debug, "w", false) if cfg.noLog { logger.SetGlobalLevel(logger.Disabled) } - conf.Encoder.Video.Codec = string(cfg.codec) - id := cfg.roomName if id == "" { id = games.GenerateRoomID(cfg.game.Name) @@ -218,6 +230,7 @@ func room(cfg conf) testRoom { m.AudioSrcHz = emu.AudioSampleRate() m.AudioFrame = conf.Encoder.Audio.Frame m.VideoW, m.VideoH = emu.ViewportSize() + m.VideoScale = emu.Scale() if err := m.Init(); err != nil { l.Fatal().Err(err).Msgf("no init") } @@ -230,22 +243,6 @@ func room(cfg conf) testRoom { return testRoom{Room: room, started: cfg.autoAppStart} } -// fixEmulators makes absolute game paths in global GameList and passes GL context config. -// hack: emulator paths should be absolute and visible to the tests. 
-func fixEmulators(config *config.WorkerConfig, autoGlContext bool) { - config.Emulator.Libretro.Cores.Paths.Libs = - filepath.FromSlash(root + config.Emulator.Libretro.Cores.Paths.Libs) - config.Emulator.LocalPath = filepath.FromSlash(filepath.Join(root, "tests", config.Emulator.LocalPath)) - config.Emulator.Storage = filepath.FromSlash(filepath.Join(root, "tests", "storage")) - - for k, conf := range config.Emulator.Libretro.Cores.List { - if conf.IsGlAllowed && autoGlContext { - conf.AutoGlContext = true - } - config.Emulator.Libretro.Cores.List[k] = conf - } -} - // Measures emulation performance of various // emulators and encoding options. func BenchmarkRoom(b *testing.B) { @@ -263,7 +260,7 @@ func BenchmarkRoom(b *testing.B) { b.StopTimer() room := room(conf{game: bench.game, codec: cod, noLog: true}) b.StartTimer() - room.WaitFrames(bench.frames) + room.WaitFrame(bench.frames) b.StopTimer() room.Room.Close() } @@ -299,3 +296,9 @@ func TestRouter(t *testing.T) { router.SetRoom(nil) router.Close() } + +// expand joins a list of file path elements. 
+func expand(p ...string) string { + ph, _ := filepath.Abs(filepath.FromSlash(filepath.Join(p...))) + return ph +} diff --git a/pkg/worker/thread/mainthread_darwin_test.go b/pkg/worker/thread/mainthread_darwin_test.go index bab4a92c3..15ce9328f 100644 --- a/pkg/worker/thread/mainthread_darwin_test.go +++ b/pkg/worker/thread/mainthread_darwin_test.go @@ -1,16 +1,14 @@ package thread -import "testing" +import ( + "os" + "testing" +) -func init() { - runtime.LockOSThread() +func TestMain(m *testing.M) { + Wrap(func() { os.Exit(m.Run()) }) } func TestMainThread(t *testing.T) { - value := 0 - fn := func() { value = 1 } - Main(fn) - if value != 1 { - t.Errorf("wrong value %v", value) - } + _ = 10 } diff --git a/test/test.go b/test/test.go new file mode 100644 index 000000000..b80b425ae --- /dev/null +++ b/test/test.go @@ -0,0 +1,17 @@ +package test + +import ( + "os" + "path" + "runtime" +) + +// runs tests from the root dir when imported + +func init() { + _, filename, _, _ := runtime.Caller(0) + dir := path.Join(path.Dir(filename), "..") + if err := os.Chdir(dir); err != nil { + panic(err) + } +} diff --git a/test/testdata/raw/000_name_fourcc_width_height_stride b/test/testdata/raw/000_name_fourcc_width_height_stride new file mode 100644 index 000000000..e69de29bb diff --git a/test/testdata/raw/001_alsa_ABGR_256_240_1024.raw.zip b/test/testdata/raw/001_alsa_ABGR_256_240_1024.raw.zip new file mode 100644 index 0000000000000000000000000000000000000000..a85e7d7bf96eb111c7ba530a210bd2b8a4a034c1 GIT binary patch literal 3748 zcmai%2~-pJx5tC1*jh|oTB%S*YZdF#u!w*NgQB8;R9uS+Iv}DbA|j#!lEJ#5mdzCv zFrwBK2`U;vV5m?6ibQH5;6l&{0RkC9$U4h?(bxCRdH-|%=gpiu_xt&M@0~ek&diy) z%l$0}kKu4Q!#EG~`~x4^?}_|iZN>^qj=ecKIyzyYJ7Pnzx%2#1V$QC!Fz1<$n3JRP z%%69M?(4}t81-)am<2BXYP5bfcwL*M$+~1gR`;lF%fHFPY#rUx$)uc#d6R#&4QfoB z)l7aE(6mnyi6dE2(NjY9{ITNRh=gRPq7V9iB{l4%etWccNs?EE-8gmP-SQUa1y8;s z^Q_MMPUMp>x1L~qmA5_vF%N5gC1oF_*oTZrGnB0rv@pe;qNE}JbUrLc9U@*Ja|SqJ 
z5r2~LUh#uCW%_K29N|4hGLkYyy=ts{{iDW^E0xAwa4tyw3BUR~FYNjLrC#|++>LaDf zNRusA(QK0A!bU>2b2e){j3cQl21wFvDI9o8 zaI3E+vk=CHT7zrF;2xy!wjALX>a4=-4Pa66wG}ugyDZ)h!{a`S1OHLfZieA&{-?$Y zc5iNNZM`(EYT`W@vbvw8glIr-TU@eoNE3H5J59Cy1Hs%d|fphNLpqG_ifYO zl9=U*R%f{IAh-~(Se=wEg66oiSLaEEKyO-~a3?qc4ScRC2)T#*Fg7Mu9-*~3`xfaH z)!_Y3m7kK-#^g%X=ESl@62&qms_RRLnR74V(6K$ zl|)1v2$iHB@w;(KBqETSBnMLvIlXxAFe4}vtRZGkiKLg3*XyG*;vV6pd+!Ok@k`y$ zp)mvyv$yNPS+&X($3>}15n|hE-YR)La9y)VuA#QDBVJ9y>wWxoKhhf*tw@^OJeiXyOb&CrnguZKT|Z1Zp7W z%Pd(d39_rvj4&A5oqB1*`e~ZN7+tO$SB(pyktuk^Y$irRvtPpRik9kVUdA!%eo{Ge zuV_Xao=jUP2gF_)YIJa7Tzvtdi5Ow?Br(0;-5stOyf?x0?oN18&kT8&zmN70uPxv1 zeQ$Tp6}@`TWf{RSL3&8F2ogh3F$}e&~d7mEstQNLF?+UAK*bT7A}(+kkiQ(+%ENc{9%9*H6I( zx?_be;A~m{oPYyZ&sgp5ZLNNRUEpGzekGHI;)39k4~y9w@!r$=&{c_^!#d%`u@`W& zbQ2xe!6wzL@#lshdGi(BTI?83>txXA>dSRkk)zqTzgFTgZFu$8`Cz29kq#sTX z`pJOXjYGEftX%S!?VBP~^b9h}t^Gc0dMNv1q^8UP79tE=O;e-BZtcCZJ9eB-I;Ojy z?D9|&B6emZO4bq$egr&X1<&pl`OiKw60DOZ1yyUje7`SH63@i%mHv+fP)^%=X{n!8 z%RO)}zGsV06mU*0f^jBBa2KUb!o>dQfdGP(`!X`wh`&hCdiPKDlPeMr7s!jQN1Qo%w26yBXd_UL_Rtc9ApyVZTz|6J;#8ijx#|F z6H}+PSZ6>Q*gJNH=(!UZys6mycI=>+Uj(u2O&sSX;FdHuLRbSr=Lvg!?B_QienWe2 z1X#Tbd}!B0$62=vY73BCIxK}n7?|uSrgpJQ%VonnL1Dq52Ka@FKdh5@uv~`aZDNxb zslDU|poyqUm$TxV-4mPDWScQYaN{Dn;yWGnlNeqCw_l*TaTdN!uhAJ5(wzd77)1u0 z4?Y-fS=6eT4n+7q7a*V!XS(E0kX!wmjs4+SCVk+ zt5w`EY#+Z#eQt{a)k~wbZVUy_=yzX5(LIukUWwD)vp1YsLym;|J*zwJ{bmIkSTLW? 
zSnxFyGucjG`jL_GH)?uc#rNF82>Uz*2=^}(43D+QB?fKpgj~tan^Z>Z>(}2N<%3u+ ze7o_$0BON~c{2|(3I%Hhul5uiS7uIS#hPW)T&ge&*8$*Uda%5{b3J^9NFHaxIX1zF zjk8YNm))SdsD)1_wFn96-SU_(z;B4P#X}k5#B6if*PD{^iXr&|kcex5jCkpSYZ=S>PDnXBL<`twk){Xqdl>=9FSDFKxjNG9rqy9dkZit{ zq2UuXfvsp;4vVjS*G!+SZ)X>BOP>&rdw84wz64|o7R(~|kJo_=tBgHRGi55^vT`7O ze;H{6>QOsPc#uqkwwy-u+g6!Z?p2k)FC1mWeON>{+70iA%ym6__nRyw)wn z_Oim~Cn-&WXP4>m{o}c)5!y63nIodvjkCXvV}&?llAqMzYd)H%0`;=g|)mWjk0lb~v^h zp=5ya`GL^{uIB-KBd0WJ_50pxfO^a%Ui!TTlxmK57V-$^Y<6;hXy9}{2w-Kb#mh~s zY-qIxbll{x%CkUZ_FX>vHRF9*SuEZ;9=Ja`05{s-4R6@RJU~~U$&?}6F~Dz5mD9GU4c7Vi!Ckl{d^({6Ve{ zE)o-LNhhGEG4POU;t;;ZVgMv#yP^e7xDEM$^0J05jjJA#Cm$Fv3lrXniJz9xthu46Jr4>8+LHZEw z`>tZ-M}R%?Ioq};C0#7zV;cm1P5P4x++?WuGfITPIgOtYY_&+X<#aoVpnPngb1882zfjs-s#UGrc;n&#`m&? zvus`vS3}064%*G(Ecdq