/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkSwizzler_opts_DEFINED
#define SkSwizzler_opts_DEFINED

#include "SkColorData.h"

#include <utility>

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    #include <immintrin.h>
#elif defined(SK_ARM_HAS_NEON)
    #include <arm_neon.h>
#endif

namespace SK_OPTS_NS {

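// A minimal usage sketch (hypothetical buffers, for illustration only): each
// routine below converts count pixels from src into dst, using the fastest
// implementation this file was compiled with, e.g.
//
//     uint32_t src[64] = { /* unpremultiplied RGBA pixels */ };
//     uint32_t dst[64];
//     SK_OPTS_NS::RGBA_to_rgbA(dst, src, 64);
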
static void RGBA_to_rgbA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t a = src[i] >> 24,
                b = src[i] >> 16,
                g = src[i] >>  8,
                r = src[i] >>  0;
        b = (b*a+127)/255;
        g = (g*a+127)/255;
        r = (r*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)b << 16
               | (uint32_t)g <<  8
               | (uint32_t)r <<  0;
    }
}

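// For example, a half-intensity channel at half alpha premultiplies as
// (0x80*0x80 + 127) / 255 = 16511 / 255 = 64 (0x40), i.e. a rounded divide
// rather than a truncating one.
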
static void RGBA_to_bgrA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t a = src[i] >> 24,
                b = src[i] >> 16,
                g = src[i] >>  8,
                r = src[i] >>  0;
        b = (b*a+127)/255;
        g = (g*a+127)/255;
        r = (r*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)r << 16
               | (uint32_t)g <<  8
               | (uint32_t)b <<  0;
    }
}

static void RGBA_to_BGRA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t a = src[i] >> 24,
                b = src[i] >> 16,
                g = src[i] >>  8,
                r = src[i] >>  0;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)r << 16
               | (uint32_t)g <<  8
               | (uint32_t)b <<  0;
    }
}

static void RGB_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t r = src[0],
                g = src[1],
                b = src[2];
        src += 3;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)b    << 16
               | (uint32_t)g    <<  8
               | (uint32_t)r    <<  0;
    }
}

static void RGB_to_BGR1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t r = src[0],
                g = src[1],
                b = src[2];
        src += 3;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)r    << 16
               | (uint32_t)g    <<  8
               | (uint32_t)b    <<  0;
    }
}

static void gray_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        dst[i] = (uint32_t)0xFF   << 24
               | (uint32_t)src[i] << 16
               | (uint32_t)src[i] <<  8
               | (uint32_t)src[i] <<  0;
    }
}

static void grayA_to_RGBA_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t g = src[0],
                a = src[1];
        src += 2;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)g << 16
               | (uint32_t)g <<  8
               | (uint32_t)g <<  0;
    }
}

static void grayA_to_rgbA_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t g = src[0],
                a = src[1];
        src += 2;
        g = (g*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)g << 16
               | (uint32_t)g <<  8
               | (uint32_t)g <<  0;
    }
}

static void inverted_CMYK_to_RGB1_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t k = src[i] >> 24,
                y = src[i] >> 16,
                m = src[i] >>  8,
                c = src[i] >>  0;
        // See comments in SkSwizzler.cpp for details on the conversion formula.
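        // Briefly: the components are stored inverted (255 - value), so the
        // usual R = (255-C)*(255-K)/255 reduces to r = c*k/255 on the stored
        // bytes, computed here with rounding as (c*k+127)/255.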
        uint8_t b = (y*k+127)/255,
                g = (m*k+127)/255,
                r = (c*k+127)/255;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)   b << 16
               | (uint32_t)   g <<  8
               | (uint32_t)   r <<  0;
    }
}

static void inverted_CMYK_to_BGR1_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t k = src[i] >> 24,
                y = src[i] >> 16,
                m = src[i] >>  8,
                c = src[i] >>  0;
        uint8_t b = (y*k+127)/255,
                g = (m*k+127)/255,
                r = (c*k+127)/255;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)   r << 16
               | (uint32_t)   g <<  8
               | (uint32_t)   b <<  0;
    }
}

#if defined(SK_ARM_HAS_NEON)

// Rounded divide by 255, (x + 127) / 255
static uint8x8_t div255_round(uint16x8_t x) {
    // result = (x + 127) / 255
    // result = (x + 127) / 256 + error1
    //
    // error1 = (x + 127) / (255 * 256)
    // error1 = (x + 127) / (256 * 256) + error2
    //
    // error2 = (x + 127) / (255 * 256 * 256)
    //
    // The maximum value of error2 is too small to matter.  Thus:
    // result = (x + 127) / 256 + (x + 127) / (256 * 256)
    // result = ((x + 127) / 256 + x + 127) / 256
    // result = ((x + 127) >> 8 + x + 127) >> 8
    //
    // Use >>> to represent "rounded right shift" which, conveniently,
    // NEON supports in one instruction.
    // result = ((x >>> 8) + x) >>> 8
    //
    // Note that the second right shift is actually performed as an
    // "add, round, and narrow back to 8-bits" instruction.
    return vraddhn_u16(x, vrshrq_n_u16(x, 8));
}

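// Sanity-checking that derivation at the extremes, using >>> from above:
// x = 0 gives ((0 >>> 8) + 0) >>> 8 = 0, and x = 255*255 = 65025 gives
// ((65025 >>> 8) + 65025) >>> 8 = (254 + 65025) >>> 8 = 255, matching
// (65025 + 127) / 255 = 255.
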
// Scale a byte by another, (x * y + 127) / 255
static uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
    return div255_round(vmull_u8(x, y));
}

template <bool kSwapRB>
static void premul_should_swapRB(uint32_t* dst, const uint32_t* src, int count) {
    while (count >= 8) {
        // Load 8 pixels.
        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);

        uint8x8_t a = rgba.val[3],
                  b = rgba.val[2],
                  g = rgba.val[1],
                  r = rgba.val[0];

        // Premultiply.
        b = scale(b, a);
        g = scale(g, a);
        r = scale(r, a);

        // Store 8 premultiplied pixels.
        if (kSwapRB) {
            rgba.val[2] = r;
            rgba.val[1] = g;
            rgba.val[0] = b;
        } else {
            rgba.val[2] = b;
            rgba.val[1] = g;
            rgba.val[0] = r;
        }
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }

    // Call portable code to finish up the tail of [0,8) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}

/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB<false>(dst, src, count);
}

/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB<true>(dst, src, count);
}

/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    using std::swap;
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src);

        // Swap r and b.
        swap(rgba.val[0], rgba.val[2]);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);

        // Swap r and b.
        swap(rgba.val[0], rgba.val[2]);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}

template <bool kSwapRB>
static void insert_alpha_should_swaprb(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x3_t rgb = vld3q_u8(src);

        // Insert an opaque alpha channel and swap if needed.
        uint8x16x4_t rgba;
        if (kSwapRB) {
            rgba.val[0] = rgb.val[2];
            rgba.val[2] = rgb.val[0];
        } else {
            rgba.val[0] = rgb.val[0];
            rgba.val[2] = rgb.val[2];
        }
        rgba.val[1] = rgb.val[1];
        rgba.val[3] = vdupq_n_u8(0xFF);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16*3;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x3_t rgb = vld3_u8(src);

        // Insert an opaque alpha channel and swap if needed.
        uint8x8x4_t rgba;
        if (kSwapRB) {
            rgba.val[0] = rgb.val[2];
            rgba.val[2] = rgb.val[0];
        } else {
            rgba.val[0] = rgb.val[0];
            rgba.val[2] = rgb.val[2];
        }
        rgba.val[1] = rgb.val[1];
        rgba.val[3] = vdup_n_u8(0xFF);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8*3;
        dst += 8;
        count -= 8;
    }

    // Call portable code to finish up the tail of [0,8) pixels.
    auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
    proc(dst, src, count);
}

/*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    insert_alpha_should_swaprb<false>(dst, src, count);
}

/*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
    insert_alpha_should_swaprb<true>(dst, src, count);
}

/*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16_t gray = vld1q_u8(src);

        // Set each of the color channels.
        uint8x16x4_t rgba;
        rgba.val[0] = gray;
        rgba.val[1] = gray;
        rgba.val[2] = gray;
        rgba.val[3] = vdupq_n_u8(0xFF);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8_t gray = vld1_u8(src);

        // Set each of the color channels.
        uint8x8x4_t rgba;
        rgba.val[0] = gray;
        rgba.val[1] = gray;
        rgba.val[2] = gray;
        rgba.val[3] = vdup_n_u8(0xFF);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }

    gray_to_RGB1_portable(dst, src, count);
}

template <bool kPremul>
static void expand_grayA(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x2_t ga = vld2q_u8(src);

        // Premultiply if requested.
        if (kPremul) {
            ga.val[0] = vcombine_u8(
                    scale(vget_low_u8(ga.val[0]),  vget_low_u8(ga.val[1])),
                    scale(vget_high_u8(ga.val[0]), vget_high_u8(ga.val[1])));
        }

        // Set each of the color channels.
        uint8x16x4_t rgba;
        rgba.val[0] = ga.val[0];
        rgba.val[1] = ga.val[0];
        rgba.val[2] = ga.val[0];
        rgba.val[3] = ga.val[1];

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16*2;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x2_t ga = vld2_u8(src);

        // Premultiply if requested.
        if (kPremul) {
            ga.val[0] = scale(ga.val[0], ga.val[1]);
        }

        // Set each of the color channels.
        uint8x8x4_t rgba;
        rgba.val[0] = ga.val[0];
        rgba.val[1] = ga.val[0];
        rgba.val[2] = ga.val[0];
        rgba.val[3] = ga.val[1];

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8*2;
        dst += 8;
        count -= 8;
    }

    auto proc = kPremul ? grayA_to_rgbA_portable : grayA_to_RGBA_portable;
    proc(dst, src, count);
}

/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
    expand_grayA<false>(dst, src, count);
}

/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
    expand_grayA<true>(dst, src, count);
}

enum Format { kRGB1, kBGR1 };
template <Format format>
static void inverted_cmyk_to(uint32_t* dst, const uint32_t* src, int count) {
    while (count >= 8) {
        // Load 8 cmyk pixels.
        uint8x8x4_t pixels = vld4_u8((const uint8_t*) src);

        uint8x8_t k = pixels.val[3],
                  y = pixels.val[2],
                  m = pixels.val[1],
                  c = pixels.val[0];

        // Scale to r, g, b.
        uint8x8_t b = scale(y, k);
        uint8x8_t g = scale(m, k);
        uint8x8_t r = scale(c, k);

        // Store 8 rgba pixels.
        if (kBGR1 == format) {
            pixels.val[3] = vdup_n_u8(0xFF);
            pixels.val[2] = r;
            pixels.val[1] = g;
            pixels.val[0] = b;
        } else {
            pixels.val[3] = vdup_n_u8(0xFF);
            pixels.val[2] = b;
            pixels.val[1] = g;
            pixels.val[0] = r;
        }
        vst4_u8((uint8_t*) dst, pixels);
        src += 8;
        dst += 8;
        count -= 8;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to<kRGB1>(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to<kBGR1>(dst, src, count);
}

#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

// Scale a byte by another.
// Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
static __m128i scale(__m128i x, __m128i y) {
    const __m128i _128 = _mm_set1_epi16(128);
    const __m128i _257 = _mm_set1_epi16(257);

    // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
    return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257);
}

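// Spot-checking that identity at the extremes: x*y = 255*255 = 65025 gives
// ((65025 + 128) * 257) >> 16 = 16744321 >> 16 = 255, and x*y = 0 gives
// (128 * 257) >> 16 = 32896 >> 16 = 0, agreeing with (x*y + 127) / 255.
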
template <bool kSwapRB>
static void premul_should_swapRB(uint32_t* dst, const uint32_t* src, int count) {

    auto premul8 = [](__m128i* lo, __m128i* hi) {
        const __m128i zeros = _mm_setzero_si128();
        __m128i planar;
        if (kSwapRB) {
            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm_shuffle_epi8(*lo, planar);           // rrrrgggg bbbbaaaa
        *hi = _mm_shuffle_epi8(*hi, planar);           // RRRRGGGG BBBBAAAA
        __m128i rg = _mm_unpacklo_epi32(*lo, *hi),     // rrrrRRRR ggggGGGG
                ba = _mm_unpackhi_epi32(*lo, *hi);     // bbbbBBBB aaaaAAAA

        // Unpack to 16-bit planar.
        __m128i r = _mm_unpacklo_epi8(rg, zeros),      // r_r_r_r_ R_R_R_R_
                g = _mm_unpackhi_epi8(rg, zeros),      // g_g_g_g_ G_G_G_G_
                b = _mm_unpacklo_epi8(ba, zeros),      // b_b_b_b_ B_B_B_B_
                a = _mm_unpackhi_epi8(ba, zeros);      // a_a_a_a_ A_A_A_A_

        // Premultiply!
        r = scale(r, a);
        g = scale(g, a);
        b = scale(b, a);

        // Repack into interlaced pixels.
        rg = _mm_or_si128(r, _mm_slli_epi16(g, 8));    // rgrgrgrg RGRGRGRG
        ba = _mm_or_si128(b, _mm_slli_epi16(a, 8));    // babababa BABABABA
        *lo = _mm_unpacklo_epi16(rg, ba);              // rgbargba rgbargba
        *hi = _mm_unpackhi_epi16(rg, ba);              // RGBARGBA RGBARGBA
    };

    while (count >= 8) {
        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
                hi = _mm_loadu_si128((const __m128i*) (src + 4));

        premul8(&lo, &hi);

        _mm_storeu_si128((__m128i*) (dst + 0), lo);
        _mm_storeu_si128((__m128i*) (dst + 4), hi);

        src += 8;
        dst += 8;
        count -= 8;
    }

    if (count >= 4) {
        __m128i lo = _mm_loadu_si128((const __m128i*) src),
                hi = _mm_setzero_si128();

        premul8(&lo, &hi);

        _mm_storeu_si128((__m128i*) dst, lo);

        src += 4;
        dst += 4;
        count -= 4;
    }

    // Call portable code to finish up the tail of [0,4) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}

/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB<false>(dst, src, count);
}

/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB<true>(dst, src, count);
}

/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);

    while (count >= 4) {
        __m128i rgba = _mm_loadu_si128((const __m128i*) src);
        __m128i bgra = _mm_shuffle_epi8(rgba, swapRB);
        _mm_storeu_si128((__m128i*) dst, bgra);

        src += 4;
        dst += 4;
        count -= 4;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}

template <bool kSwapRB>
static void insert_alpha_should_swaprb(uint32_t dst[], const uint8_t* src, int count) {
    const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
    __m128i expand;
    const uint8_t X = 0xFF; // Used as a placeholder.  The value of X is irrelevant.
    if (kSwapRB) {
        expand = _mm_setr_epi8(2,1,0,X, 5,4,3,X, 8,7,6,X, 11,10,9,X);
    } else {
        expand = _mm_setr_epi8(0,1,2,X, 3,4,5,X, 6,7,8,X, 9,10,11,X);
    }

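    // Requiring count >= 6 (18 bytes remaining) keeps the 16-byte load below
    // fully in bounds; with 5 or fewer pixels (<= 15 bytes) it could read
    // past the end of src.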
    while (count >= 6) {
        // Load a vector.  While this actually contains 5 pixels plus an
        // extra component, we will discard all but the first four pixels on
        // this iteration.
        __m128i rgb = _mm_loadu_si128((const __m128i*) src);

        // Expand the first four pixels to RGBX and then mask to RGB(FF).
        __m128i rgba = _mm_or_si128(_mm_shuffle_epi8(rgb, expand), alphaMask);

        // Store 4 pixels.
        _mm_storeu_si128((__m128i*) dst, rgba);

        src += 4*3;
        dst += 4;
        count -= 4;
    }

    // Call portable code to finish up the tail of [0,4) pixels.
    auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
    proc(dst, src, count);
}

|
/*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
|
insert_alpha_should_swaprb<false>(dst, src, count);
|
}
|
|
/*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
|
insert_alpha_should_swaprb<true>(dst, src, count);
|
}
|
|
/*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
|
const __m128i alphas = _mm_set1_epi8((uint8_t) 0xFF);
|
while (count >= 16) {
|
__m128i grays = _mm_loadu_si128((const __m128i*) src);
|
|
__m128i gg_lo = _mm_unpacklo_epi8(grays, grays);
|
__m128i gg_hi = _mm_unpackhi_epi8(grays, grays);
|
__m128i ga_lo = _mm_unpacklo_epi8(grays, alphas);
|
__m128i ga_hi = _mm_unpackhi_epi8(grays, alphas);
|
|
__m128i ggga0 = _mm_unpacklo_epi16(gg_lo, ga_lo);
|
__m128i ggga1 = _mm_unpackhi_epi16(gg_lo, ga_lo);
|
__m128i ggga2 = _mm_unpacklo_epi16(gg_hi, ga_hi);
|
__m128i ggga3 = _mm_unpackhi_epi16(gg_hi, ga_hi);
|
|
_mm_storeu_si128((__m128i*) (dst + 0), ggga0);
|
_mm_storeu_si128((__m128i*) (dst + 4), ggga1);
|
_mm_storeu_si128((__m128i*) (dst + 8), ggga2);
|
_mm_storeu_si128((__m128i*) (dst + 12), ggga3);
|
|
src += 16;
|
dst += 16;
|
count -= 16;
|
}
|
|
gray_to_RGB1_portable(dst, src, count);
|
}
|
|
/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
|
while (count >= 8) {
|
__m128i ga = _mm_loadu_si128((const __m128i*) src);
|
|
__m128i gg = _mm_or_si128(_mm_and_si128(ga, _mm_set1_epi16(0x00FF)),
|
_mm_slli_epi16(ga, 8));
|
|
__m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
|
__m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
|
|
_mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
|
_mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);
|
|
src += 8*2;
|
dst += 8;
|
count -= 8;
|
}
|
|
grayA_to_RGBA_portable(dst, src, count);
|
}
|
|
/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
|
while (count >= 8) {
|
__m128i grayA = _mm_loadu_si128((const __m128i*) src);
|
|
__m128i g0 = _mm_and_si128(grayA, _mm_set1_epi16(0x00FF));
|
__m128i a0 = _mm_srli_epi16(grayA, 8);
|
|
// Premultiply
|
g0 = scale(g0, a0);
|
|
__m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8));
|
__m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8));
|
|
|
__m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
|
__m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
|
|
_mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
|
_mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);
|
|
src += 8*2;
|
dst += 8;
|
count -= 8;
|
}
|
|
grayA_to_rgbA_portable(dst, src, count);
|
}
|
|
enum Format { kRGB1, kBGR1 };
template <Format format>
static void inverted_cmyk_to(uint32_t* dst, const uint32_t* src, int count) {
    auto convert8 = [](__m128i* lo, __m128i* hi) {
        const __m128i zeros = _mm_setzero_si128();
        __m128i planar;
        if (kBGR1 == format) {
            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm_shuffle_epi8(*lo, planar);                             // ccccmmmm yyyykkkk
        *hi = _mm_shuffle_epi8(*hi, planar);                             // CCCCMMMM YYYYKKKK
        __m128i cm = _mm_unpacklo_epi32(*lo, *hi),                       // ccccCCCC mmmmMMMM
                yk = _mm_unpackhi_epi32(*lo, *hi);                       // yyyyYYYY kkkkKKKK

        // Unpack to 16-bit planar.
        __m128i c = _mm_unpacklo_epi8(cm, zeros),                        // c_c_c_c_ C_C_C_C_
                m = _mm_unpackhi_epi8(cm, zeros),                        // m_m_m_m_ M_M_M_M_
                y = _mm_unpacklo_epi8(yk, zeros),                        // y_y_y_y_ Y_Y_Y_Y_
                k = _mm_unpackhi_epi8(yk, zeros);                        // k_k_k_k_ K_K_K_K_

        // Scale to r, g, b.
        __m128i r = scale(c, k),
                g = scale(m, k),
                b = scale(y, k);

        // Repack into interlaced pixels.
        __m128i rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)),              // rgrgrgrg RGRGRGRG
                ba = _mm_or_si128(b, _mm_set1_epi16((uint16_t) 0xFF00)); // b1b1b1b1 B1B1B1B1
        *lo = _mm_unpacklo_epi16(rg, ba);                                // rgb1rgb1 rgb1rgb1
        *hi = _mm_unpackhi_epi16(rg, ba);                                // RGB1RGB1 RGB1RGB1
    };

    while (count >= 8) {
        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
                hi = _mm_loadu_si128((const __m128i*) (src + 4));

        convert8(&lo, &hi);

        _mm_storeu_si128((__m128i*) (dst + 0), lo);
        _mm_storeu_si128((__m128i*) (dst + 4), hi);

        src += 8;
        dst += 8;
        count -= 8;
    }

    if (count >= 4) {
        __m128i lo = _mm_loadu_si128((const __m128i*) src),
                hi = _mm_setzero_si128();

        convert8(&lo, &hi);

        _mm_storeu_si128((__m128i*) dst, lo);

        src += 4;
        dst += 4;
        count -= 4;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to<kRGB1>(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to<kBGR1>(dst, src, count);
}

#else

/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    RGBA_to_rgbA_portable(dst, src, count);
}

/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    RGBA_to_bgrA_portable(dst, src, count);
}

/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    RGBA_to_BGRA_portable(dst, src, count);
}

/*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    RGB_to_RGB1_portable(dst, src, count);
}

/*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
    RGB_to_BGR1_portable(dst, src, count);
}

/*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    gray_to_RGB1_portable(dst, src, count);
}

/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
    grayA_to_RGBA_portable(dst, src, count);
}

/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
    grayA_to_rgbA_portable(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_CMYK_to_RGB1_portable(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_CMYK_to_BGR1_portable(dst, src, count);
}

#endif

}  // namespace SK_OPTS_NS

#endif // SkSwizzler_opts_DEFINED