Blame - third_party/libyuv/source/row_common.cc - aom

blob: ceb3836cdd7d36ed81825290d959bca98d28bfe1 [file] [log] [blame]

Deb Mukherjee	47031c0	2014-05-16 18:52:01 -0700	[diff] [blame^]	1	/*
				2	* Copyright 2011 The LibYuv Project Authors. All rights reserved.
				3	*
				4	* Use of this source code is governed by a BSD-style license
				5	* that can be found in the LICENSE file in the root of the source
				6	* tree. An additional intellectual property rights grant can be found
				7	* in the file PATENTS. All contributing project authors may
				8	* be found in the AUTHORS file in the root of the source tree.
				9	*/
				10
				11	#include "third_party/libyuv/include/libyuv/row.h"
				12
				13	#include <string.h> // For memcpy and memset.
				14
				15	#include "third_party/libyuv/include/libyuv/basic_types.h"
				16
				17	#ifdef __cplusplus
				18	namespace libyuv {
				19	extern "C" {
				20	#endif
				21
				22	// llvm x86 is poor at ternary operator, so use branchless min/max.
				23
				24	#define USE_BRANCHLESS 1
				25	#if USE_BRANCHLESS
				26	static __inline int32 clamp0(int32 v) {
				27	return ((-(v) >> 31) & (v));
				28	}
				29
				30	static __inline int32 clamp255(int32 v) {
				31	return (((255 - (v)) >> 31) \| (v)) & 255;
				32	}
				33
				34	static __inline uint32 Clamp(int32 val) {
				35	int v = clamp0(val);
				36	return (uint32)(clamp255(v));
				37	}
				38
				39	static __inline uint32 Abs(int32 v) {
				40	int m = v >> 31;
				41	return (v + m) ^ m;
				42	}
				43	#else // USE_BRANCHLESS
				44	static __inline int32 clamp0(int32 v) {
				45	return (v < 0) ? 0 : v;
				46	}
				47
				48	static __inline int32 clamp255(int32 v) {
				49	return (v > 255) ? 255 : v;
				50	}
				51
				52	static __inline uint32 Clamp(int32 val) {
				53	int v = clamp0(val);
				54	return (uint32)(clamp255(v));
				55	}
				56
				57	static __inline uint32 Abs(int32 v) {
				58	return (v < 0) ? -v : v;
				59	}
				60	#endif // USE_BRANCHLESS
				61
				62	#ifdef LIBYUV_LITTLE_ENDIAN
				63	#define WRITEWORD(p, v) (uint32)(p) = v
				64	#else
				65	static inline void WRITEWORD(uint8* p, uint32 v) {
				66	p[0] = (uint8)(v & 255);
				67	p[1] = (uint8)((v >> 8) & 255);
				68	p[2] = (uint8)((v >> 16) & 255);
				69	p[3] = (uint8)((v >> 24) & 255);
				70	}
				71	#endif
				72
				73	void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
				74	int x;
				75	for (x = 0; x < width; ++x) {
				76	uint8 b = src_rgb24[0];
				77	uint8 g = src_rgb24[1];
				78	uint8 r = src_rgb24[2];
				79	dst_argb[0] = b;
				80	dst_argb[1] = g;
				81	dst_argb[2] = r;
				82	dst_argb[3] = 255u;
				83	dst_argb += 4;
				84	src_rgb24 += 3;
				85	}
				86	}
				87
				88	void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
				89	int x;
				90	for (x = 0; x < width; ++x) {
				91	uint8 r = src_raw[0];
				92	uint8 g = src_raw[1];
				93	uint8 b = src_raw[2];
				94	dst_argb[0] = b;
				95	dst_argb[1] = g;
				96	dst_argb[2] = r;
				97	dst_argb[3] = 255u;
				98	dst_argb += 4;
				99	src_raw += 3;
				100	}
				101	}
				102
				103	void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) {
				104	int x;
				105	for (x = 0; x < width; ++x) {
				106	uint8 b = src_rgb565[0] & 0x1f;
				107	uint8 g = (src_rgb565[0] >> 5) \| ((src_rgb565[1] & 0x07) << 3);
				108	uint8 r = src_rgb565[1] >> 3;
				109	dst_argb[0] = (b << 3) \| (b >> 2);
				110	dst_argb[1] = (g << 2) \| (g >> 4);
				111	dst_argb[2] = (r << 3) \| (r >> 2);
				112	dst_argb[3] = 255u;
				113	dst_argb += 4;
				114	src_rgb565 += 2;
				115	}
				116	}
				117
				118	void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb,
				119	int width) {
				120	int x;
				121	for (x = 0; x < width; ++x) {
				122	uint8 b = src_argb1555[0] & 0x1f;
				123	uint8 g = (src_argb1555[0] >> 5) \| ((src_argb1555[1] & 0x03) << 3);
				124	uint8 r = (src_argb1555[1] & 0x7c) >> 2;
				125	uint8 a = src_argb1555[1] >> 7;
				126	dst_argb[0] = (b << 3) \| (b >> 2);
				127	dst_argb[1] = (g << 3) \| (g >> 2);
				128	dst_argb[2] = (r << 3) \| (r >> 2);
				129	dst_argb[3] = -a;
				130	dst_argb += 4;
				131	src_argb1555 += 2;
				132	}
				133	}
				134
				135	void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb,
				136	int width) {
				137	int x;
				138	for (x = 0; x < width; ++x) {
				139	uint8 b = src_argb4444[0] & 0x0f;
				140	uint8 g = src_argb4444[0] >> 4;
				141	uint8 r = src_argb4444[1] & 0x0f;
				142	uint8 a = src_argb4444[1] >> 4;
				143	dst_argb[0] = (b << 4) \| b;
				144	dst_argb[1] = (g << 4) \| g;
				145	dst_argb[2] = (r << 4) \| r;
				146	dst_argb[3] = (a << 4) \| a;
				147	dst_argb += 4;
				148	src_argb4444 += 2;
				149	}
				150	}
				151
				152	void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
				153	int x;
				154	for (x = 0; x < width; ++x) {
				155	uint8 b = src_argb[0];
				156	uint8 g = src_argb[1];
				157	uint8 r = src_argb[2];
				158	dst_rgb[0] = b;
				159	dst_rgb[1] = g;
				160	dst_rgb[2] = r;
				161	dst_rgb += 3;
				162	src_argb += 4;
				163	}
				164	}
				165
				166	void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
				167	int x;
				168	for (x = 0; x < width; ++x) {
				169	uint8 b = src_argb[0];
				170	uint8 g = src_argb[1];
				171	uint8 r = src_argb[2];
				172	dst_rgb[0] = r;
				173	dst_rgb[1] = g;
				174	dst_rgb[2] = b;
				175	dst_rgb += 3;
				176	src_argb += 4;
				177	}
				178	}
				179
				180	void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
				181	int x;
				182	for (x = 0; x < width - 1; x += 2) {
				183	uint8 b0 = src_argb[0] >> 3;
				184	uint8 g0 = src_argb[1] >> 2;
				185	uint8 r0 = src_argb[2] >> 3;
				186	uint8 b1 = src_argb[4] >> 3;
				187	uint8 g1 = src_argb[5] >> 2;
				188	uint8 r1 = src_argb[6] >> 3;
				189	WRITEWORD(dst_rgb, b0 \| (g0 << 5) \| (r0 << 11) \|
				190	(b1 << 16) \| (g1 << 21) \| (r1 << 27));
				191	dst_rgb += 4;
				192	src_argb += 8;
				193	}
				194	if (width & 1) {
				195	uint8 b0 = src_argb[0] >> 3;
				196	uint8 g0 = src_argb[1] >> 2;
				197	uint8 r0 = src_argb[2] >> 3;
				198	(uint16)(dst_rgb) = b0 \| (g0 << 5) \| (r0 << 11);
				199	}
				200	}
				201
				202	void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
				203	int x;
				204	for (x = 0; x < width - 1; x += 2) {
				205	uint8 b0 = src_argb[0] >> 3;
				206	uint8 g0 = src_argb[1] >> 3;
				207	uint8 r0 = src_argb[2] >> 3;
				208	uint8 a0 = src_argb[3] >> 7;
				209	uint8 b1 = src_argb[4] >> 3;
				210	uint8 g1 = src_argb[5] >> 3;
				211	uint8 r1 = src_argb[6] >> 3;
				212	uint8 a1 = src_argb[7] >> 7;
				213	(uint32)(dst_rgb) =
				214	b0 \| (g0 << 5) \| (r0 << 10) \| (a0 << 15) \|
				215	(b1 << 16) \| (g1 << 21) \| (r1 << 26) \| (a1 << 31);
				216	dst_rgb += 4;
				217	src_argb += 8;
				218	}
				219	if (width & 1) {
				220	uint8 b0 = src_argb[0] >> 3;
				221	uint8 g0 = src_argb[1] >> 3;
				222	uint8 r0 = src_argb[2] >> 3;
				223	uint8 a0 = src_argb[3] >> 7;
				224	(uint16)(dst_rgb) =
				225	b0 \| (g0 << 5) \| (r0 << 10) \| (a0 << 15);
				226	}
				227	}
				228
				229	void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
				230	int x;
				231	for (x = 0; x < width - 1; x += 2) {
				232	uint8 b0 = src_argb[0] >> 4;
				233	uint8 g0 = src_argb[1] >> 4;
				234	uint8 r0 = src_argb[2] >> 4;
				235	uint8 a0 = src_argb[3] >> 4;
				236	uint8 b1 = src_argb[4] >> 4;
				237	uint8 g1 = src_argb[5] >> 4;
				238	uint8 r1 = src_argb[6] >> 4;
				239	uint8 a1 = src_argb[7] >> 4;
				240	(uint32)(dst_rgb) =
				241	b0 \| (g0 << 4) \| (r0 << 8) \| (a0 << 12) \|
				242	(b1 << 16) \| (g1 << 20) \| (r1 << 24) \| (a1 << 28);
				243	dst_rgb += 4;
				244	src_argb += 8;
				245	}
				246	if (width & 1) {
				247	uint8 b0 = src_argb[0] >> 4;
				248	uint8 g0 = src_argb[1] >> 4;
				249	uint8 r0 = src_argb[2] >> 4;
				250	uint8 a0 = src_argb[3] >> 4;
				251	(uint16)(dst_rgb) =
				252	b0 \| (g0 << 4) \| (r0 << 8) \| (a0 << 12);
				253	}
				254	}
				255
				256	static __inline int RGBToY(uint8 r, uint8 g, uint8 b) {
				257	return (66 * r + 129 * g + 25 * b + 0x1080) >> 8;
				258	}
				259
				260	static __inline int RGBToU(uint8 r, uint8 g, uint8 b) {
				261	return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
				262	}
				263	static __inline int RGBToV(uint8 r, uint8 g, uint8 b) {
				264	return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
				265	}
				266
				267	#define MAKEROWY(NAME, R, G, B, BPP) \
				268	void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
				269	int x; \
				270	for (x = 0; x < width; ++x) { \
				271	dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
				272	src_argb0 += BPP; \
				273	dst_y += 1; \
				274	} \
				275	} \
				276	void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb, \
				277	uint8* dst_u, uint8* dst_v, int width) { \
				278	const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \
				279	int x; \
				280	for (x = 0; x < width - 1; x += 2) { \
				281	uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] + \
				282	src_rgb1[B] + src_rgb1[B + BPP]) >> 2; \
				283	uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] + \
				284	src_rgb1[G] + src_rgb1[G + BPP]) >> 2; \
				285	uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] + \
				286	src_rgb1[R] + src_rgb1[R + BPP]) >> 2; \
				287	dst_u[0] = RGBToU(ar, ag, ab); \
				288	dst_v[0] = RGBToV(ar, ag, ab); \
				289	src_rgb0 += BPP * 2; \
				290	src_rgb1 += BPP * 2; \
				291	dst_u += 1; \
				292	dst_v += 1; \
				293	} \
				294	if (width & 1) { \
				295	uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \
				296	uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \
				297	uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \
				298	dst_u[0] = RGBToU(ar, ag, ab); \
				299	dst_v[0] = RGBToV(ar, ag, ab); \
				300	} \
				301	}
				302
				303	MAKEROWY(ARGB, 2, 1, 0, 4)
				304	MAKEROWY(BGRA, 1, 2, 3, 4)
				305	MAKEROWY(ABGR, 0, 1, 2, 4)
				306	MAKEROWY(RGBA, 3, 2, 1, 4)
				307	MAKEROWY(RGB24, 2, 1, 0, 3)
				308	MAKEROWY(RAW, 0, 1, 2, 3)
				309	#undef MAKEROWY
				310
				311	// JPeg uses a variation on BT.601-1 full range
				312	// y = 0.29900 * r + 0.58700 * g + 0.11400 * b
				313	// u = -0.16874 * r - 0.33126 * g + 0.50000 * b + center
				314	// v = 0.50000 * r - 0.41869 * g - 0.08131 * b + center
				315	// BT.601 Mpeg range uses:
				316	// b 0.1016 * 255 = 25.908 = 25
				317	// g 0.5078 * 255 = 129.489 = 129
				318	// r 0.2578 * 255 = 65.739 = 66
				319	// JPeg 8 bit Y (not used):
				320	// b 0.11400 * 256 = 29.184 = 29
				321	// g 0.58700 * 256 = 150.272 = 150
				322	// r 0.29900 * 256 = 76.544 = 77
				323	// JPeg 7 bit Y:
				324	// b 0.11400 * 128 = 14.592 = 15
				325	// g 0.58700 * 128 = 75.136 = 75
				326	// r 0.29900 * 128 = 38.272 = 38
				327	// JPeg 8 bit U:
				328	// b 0.50000 * 255 = 127.5 = 127
				329	// g -0.33126 * 255 = -84.4713 = -84
				330	// r -0.16874 * 255 = -43.0287 = -43
				331	// JPeg 8 bit V:
				332	// b -0.08131 * 255 = -20.73405 = -20
				333	// g -0.41869 * 255 = -106.76595 = -107
				334	// r 0.50000 * 255 = 127.5 = 127
				335
				336	static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) {
				337	return (38 * r + 75 * g + 15 * b + 64) >> 7;
				338	}
				339
				340	static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) {
				341	return (127 * b - 84 * g - 43 * r + 0x8080) >> 8;
				342	}
				343	static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) {
				344	return (127 * r - 107 * g - 20 * b + 0x8080) >> 8;
				345	}
				346
				347	#define AVGB(a, b) (((a) + (b) + 1) >> 1)
				348
				349	#define MAKEROWYJ(NAME, R, G, B, BPP) \
				350	void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
				351	int x; \
				352	for (x = 0; x < width; ++x) { \
				353	dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \
				354	src_argb0 += BPP; \
				355	dst_y += 1; \
				356	} \
				357	} \
				358	void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb, \
				359	uint8* dst_u, uint8* dst_v, int width) { \
				360	const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \
				361	int x; \
				362	for (x = 0; x < width - 1; x += 2) { \
				363	uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \
				364	AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \
				365	uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \
				366	AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \
				367	uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \
				368	AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \
				369	dst_u[0] = RGBToUJ(ar, ag, ab); \
				370	dst_v[0] = RGBToVJ(ar, ag, ab); \
				371	src_rgb0 += BPP * 2; \
				372	src_rgb1 += BPP * 2; \
				373	dst_u += 1; \
				374	dst_v += 1; \
				375	} \
				376	if (width & 1) { \
				377	uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]); \
				378	uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]); \
				379	uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]); \
				380	dst_u[0] = RGBToUJ(ar, ag, ab); \
				381	dst_v[0] = RGBToVJ(ar, ag, ab); \
				382	} \
				383	}
				384
				385	MAKEROWYJ(ARGB, 2, 1, 0, 4)
				386	#undef MAKEROWYJ
				387
				388	void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) {
				389	int x;
				390	for (x = 0; x < width; ++x) {
				391	uint8 b = src_rgb565[0] & 0x1f;
				392	uint8 g = (src_rgb565[0] >> 5) \| ((src_rgb565[1] & 0x07) << 3);
				393	uint8 r = src_rgb565[1] >> 3;
				394	b = (b << 3) \| (b >> 2);
				395	g = (g << 2) \| (g >> 4);
				396	r = (r << 3) \| (r >> 2);
				397	dst_y[0] = RGBToY(r, g, b);
				398	src_rgb565 += 2;
				399	dst_y += 1;
				400	}
				401	}
				402
				403	void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) {
				404	int x;
				405	for (x = 0; x < width; ++x) {
				406	uint8 b = src_argb1555[0] & 0x1f;
				407	uint8 g = (src_argb1555[0] >> 5) \| ((src_argb1555[1] & 0x03) << 3);
				408	uint8 r = (src_argb1555[1] & 0x7c) >> 2;
				409	b = (b << 3) \| (b >> 2);
				410	g = (g << 3) \| (g >> 2);
				411	r = (r << 3) \| (r >> 2);
				412	dst_y[0] = RGBToY(r, g, b);
				413	src_argb1555 += 2;
				414	dst_y += 1;
				415	}
				416	}
				417
				418	void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) {
				419	int x;
				420	for (x = 0; x < width; ++x) {
				421	uint8 b = src_argb4444[0] & 0x0f;
				422	uint8 g = src_argb4444[0] >> 4;
				423	uint8 r = src_argb4444[1] & 0x0f;
				424	b = (b << 4) \| b;
				425	g = (g << 4) \| g;
				426	r = (r << 4) \| r;
				427	dst_y[0] = RGBToY(r, g, b);
				428	src_argb4444 += 2;
				429	dst_y += 1;
				430	}
				431	}
				432
				433	void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
				434	uint8* dst_u, uint8* dst_v, int width) {
				435	const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565;
				436	int x;
				437	for (x = 0; x < width - 1; x += 2) {
				438	uint8 b0 = src_rgb565[0] & 0x1f;
				439	uint8 g0 = (src_rgb565[0] >> 5) \| ((src_rgb565[1] & 0x07) << 3);
				440	uint8 r0 = src_rgb565[1] >> 3;
				441	uint8 b1 = src_rgb565[2] & 0x1f;
				442	uint8 g1 = (src_rgb565[2] >> 5) \| ((src_rgb565[3] & 0x07) << 3);
				443	uint8 r1 = src_rgb565[3] >> 3;
				444	uint8 b2 = next_rgb565[0] & 0x1f;
				445	uint8 g2 = (next_rgb565[0] >> 5) \| ((next_rgb565[1] & 0x07) << 3);
				446	uint8 r2 = next_rgb565[1] >> 3;
				447	uint8 b3 = next_rgb565[2] & 0x1f;
				448	uint8 g3 = (next_rgb565[2] >> 5) \| ((next_rgb565[3] & 0x07) << 3);
				449	uint8 r3 = next_rgb565[3] >> 3;
				450	uint8 b = (b0 + b1 + b2 + b3); // 565 * 4 = 787.
				451	uint8 g = (g0 + g1 + g2 + g3);
				452	uint8 r = (r0 + r1 + r2 + r3);
				453	b = (b << 1) \| (b >> 6); // 787 -> 888.
				454	r = (r << 1) \| (r >> 6);
				455	dst_u[0] = RGBToU(r, g, b);
				456	dst_v[0] = RGBToV(r, g, b);
				457	src_rgb565 += 4;
				458	next_rgb565 += 4;
				459	dst_u += 1;
				460	dst_v += 1;
				461	}
				462	if (width & 1) {
				463	uint8 b0 = src_rgb565[0] & 0x1f;
				464	uint8 g0 = (src_rgb565[0] >> 5) \| ((src_rgb565[1] & 0x07) << 3);
				465	uint8 r0 = src_rgb565[1] >> 3;
				466	uint8 b2 = next_rgb565[0] & 0x1f;
				467	uint8 g2 = (next_rgb565[0] >> 5) \| ((next_rgb565[1] & 0x07) << 3);
				468	uint8 r2 = next_rgb565[1] >> 3;
				469	uint8 b = (b0 + b2); // 565 * 2 = 676.
				470	uint8 g = (g0 + g2);
				471	uint8 r = (r0 + r2);
				472	b = (b << 2) \| (b >> 4); // 676 -> 888
				473	g = (g << 1) \| (g >> 6);
				474	r = (r << 2) \| (r >> 4);
				475	dst_u[0] = RGBToU(r, g, b);
				476	dst_v[0] = RGBToV(r, g, b);
				477	}
				478	}
				479
				480	void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
				481	uint8* dst_u, uint8* dst_v, int width) {
				482	const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555;
				483	int x;
				484	for (x = 0; x < width - 1; x += 2) {
				485	uint8 b0 = src_argb1555[0] & 0x1f;
				486	uint8 g0 = (src_argb1555[0] >> 5) \| ((src_argb1555[1] & 0x03) << 3);
				487	uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
				488	uint8 b1 = src_argb1555[2] & 0x1f;
				489	uint8 g1 = (src_argb1555[2] >> 5) \| ((src_argb1555[3] & 0x03) << 3);
				490	uint8 r1 = (src_argb1555[3] & 0x7c) >> 2;
				491	uint8 b2 = next_argb1555[0] & 0x1f;
				492	uint8 g2 = (next_argb1555[0] >> 5) \| ((next_argb1555[1] & 0x03) << 3);
				493	uint8 r2 = (next_argb1555[1] & 0x7c) >> 2;
				494	uint8 b3 = next_argb1555[2] & 0x1f;
				495	uint8 g3 = (next_argb1555[2] >> 5) \| ((next_argb1555[3] & 0x03) << 3);
				496	uint8 r3 = (next_argb1555[3] & 0x7c) >> 2;
				497	uint8 b = (b0 + b1 + b2 + b3); // 555 * 4 = 777.
				498	uint8 g = (g0 + g1 + g2 + g3);
				499	uint8 r = (r0 + r1 + r2 + r3);
				500	b = (b << 1) \| (b >> 6); // 777 -> 888.
				501	g = (g << 1) \| (g >> 6);
				502	r = (r << 1) \| (r >> 6);
				503	dst_u[0] = RGBToU(r, g, b);
				504	dst_v[0] = RGBToV(r, g, b);
				505	src_argb1555 += 4;
				506	next_argb1555 += 4;
				507	dst_u += 1;
				508	dst_v += 1;
				509	}
				510	if (width & 1) {
				511	uint8 b0 = src_argb1555[0] & 0x1f;
				512	uint8 g0 = (src_argb1555[0] >> 5) \| ((src_argb1555[1] & 0x03) << 3);
				513	uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
				514	uint8 b2 = next_argb1555[0] & 0x1f;
				515	uint8 g2 = (next_argb1555[0] >> 5) \| ((next_argb1555[1] & 0x03) << 3);
				516	uint8 r2 = next_argb1555[1] >> 3;
				517	uint8 b = (b0 + b2); // 555 * 2 = 666.
				518	uint8 g = (g0 + g2);
				519	uint8 r = (r0 + r2);
				520	b = (b << 2) \| (b >> 4); // 666 -> 888.
				521	g = (g << 2) \| (g >> 4);
				522	r = (r << 2) \| (r >> 4);
				523	dst_u[0] = RGBToU(r, g, b);
				524	dst_v[0] = RGBToV(r, g, b);
				525	}
				526	}
				527
				528	void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
				529	uint8* dst_u, uint8* dst_v, int width) {
				530	const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444;
				531	int x;
				532	for (x = 0; x < width - 1; x += 2) {
				533	uint8 b0 = src_argb4444[0] & 0x0f;
				534	uint8 g0 = src_argb4444[0] >> 4;
				535	uint8 r0 = src_argb4444[1] & 0x0f;
				536	uint8 b1 = src_argb4444[2] & 0x0f;
				537	uint8 g1 = src_argb4444[2] >> 4;
				538	uint8 r1 = src_argb4444[3] & 0x0f;
				539	uint8 b2 = next_argb4444[0] & 0x0f;
				540	uint8 g2 = next_argb4444[0] >> 4;
				541	uint8 r2 = next_argb4444[1] & 0x0f;
				542	uint8 b3 = next_argb4444[2] & 0x0f;
				543	uint8 g3 = next_argb4444[2] >> 4;
				544	uint8 r3 = next_argb4444[3] & 0x0f;
				545	uint8 b = (b0 + b1 + b2 + b3); // 444 * 4 = 666.
				546	uint8 g = (g0 + g1 + g2 + g3);
				547	uint8 r = (r0 + r1 + r2 + r3);
				548	b = (b << 2) \| (b >> 4); // 666 -> 888.
				549	g = (g << 2) \| (g >> 4);
				550	r = (r << 2) \| (r >> 4);
				551	dst_u[0] = RGBToU(r, g, b);
				552	dst_v[0] = RGBToV(r, g, b);
				553	src_argb4444 += 4;
				554	next_argb4444 += 4;
				555	dst_u += 1;
				556	dst_v += 1;
				557	}
				558	if (width & 1) {
				559	uint8 b0 = src_argb4444[0] & 0x0f;
				560	uint8 g0 = src_argb4444[0] >> 4;
				561	uint8 r0 = src_argb4444[1] & 0x0f;
				562	uint8 b2 = next_argb4444[0] & 0x0f;
				563	uint8 g2 = next_argb4444[0] >> 4;
				564	uint8 r2 = next_argb4444[1] & 0x0f;
				565	uint8 b = (b0 + b2); // 444 * 2 = 555.
				566	uint8 g = (g0 + g2);
				567	uint8 r = (r0 + r2);
				568	b = (b << 3) \| (b >> 2); // 555 -> 888.
				569	g = (g << 3) \| (g >> 2);
				570	r = (r << 3) \| (r >> 2);
				571	dst_u[0] = RGBToU(r, g, b);
				572	dst_v[0] = RGBToV(r, g, b);
				573	}
				574	}
				575
				576	void ARGBToUV444Row_C(const uint8* src_argb,
				577	uint8* dst_u, uint8* dst_v, int width) {
				578	int x;
				579	for (x = 0; x < width; ++x) {
				580	uint8 ab = src_argb[0];
				581	uint8 ag = src_argb[1];
				582	uint8 ar = src_argb[2];
				583	dst_u[0] = RGBToU(ar, ag, ab);
				584	dst_v[0] = RGBToV(ar, ag, ab);
				585	src_argb += 4;
				586	dst_u += 1;
				587	dst_v += 1;
				588	}
				589	}
				590
				591	void ARGBToUV422Row_C(const uint8* src_argb,
				592	uint8* dst_u, uint8* dst_v, int width) {
				593	int x;
				594	for (x = 0; x < width - 1; x += 2) {
				595	uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
				596	uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
				597	uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
				598	dst_u[0] = RGBToU(ar, ag, ab);
				599	dst_v[0] = RGBToV(ar, ag, ab);
				600	src_argb += 8;
				601	dst_u += 1;
				602	dst_v += 1;
				603	}
				604	if (width & 1) {
				605	uint8 ab = src_argb[0];
				606	uint8 ag = src_argb[1];
				607	uint8 ar = src_argb[2];
				608	dst_u[0] = RGBToU(ar, ag, ab);
				609	dst_v[0] = RGBToV(ar, ag, ab);
				610	}
				611	}
				612
				613	void ARGBToUV411Row_C(const uint8* src_argb,
				614	uint8* dst_u, uint8* dst_v, int width) {
				615	int x;
				616	for (x = 0; x < width - 3; x += 4) {
				617	uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[12]) >> 2;
				618	uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[13]) >> 2;
				619	uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[14]) >> 2;
				620	dst_u[0] = RGBToU(ar, ag, ab);
				621	dst_v[0] = RGBToV(ar, ag, ab);
				622	src_argb += 16;
				623	dst_u += 1;
				624	dst_v += 1;
				625	}
				626	if ((width & 3) == 3) {
				627	uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8]) / 3;
				628	uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9]) / 3;
				629	uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10]) / 3;
				630	dst_u[0] = RGBToU(ar, ag, ab);
				631	dst_v[0] = RGBToV(ar, ag, ab);
				632	} else if ((width & 3) == 2) {
				633	uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
				634	uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
				635	uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
				636	dst_u[0] = RGBToU(ar, ag, ab);
				637	dst_v[0] = RGBToV(ar, ag, ab);
				638	} else if ((width & 3) == 1) {
				639	uint8 ab = src_argb[0];
				640	uint8 ag = src_argb[1];
				641	uint8 ar = src_argb[2];
				642	dst_u[0] = RGBToU(ar, ag, ab);
				643	dst_v[0] = RGBToV(ar, ag, ab);
				644	}
				645	}
				646
				647	void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
				648	int x;
				649	for (x = 0; x < width; ++x) {
				650	uint8 y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]);
				651	dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
				652	dst_argb[3] = src_argb[3];
				653	dst_argb += 4;
				654	src_argb += 4;
				655	}
				656	}
				657
				658	// Convert a row of image to Sepia tone.
				659	void ARGBSepiaRow_C(uint8* dst_argb, int width) {
				660	int x;
				661	for (x = 0; x < width; ++x) {
				662	int b = dst_argb[0];
				663	int g = dst_argb[1];
				664	int r = dst_argb[2];
				665	int sb = (b * 17 + g * 68 + r * 35) >> 7;
				666	int sg = (b * 22 + g * 88 + r * 45) >> 7;
				667	int sr = (b * 24 + g * 98 + r * 50) >> 7;
				668	// b does not over flow. a is preserved from original.
				669	dst_argb[0] = sb;
				670	dst_argb[1] = clamp255(sg);
				671	dst_argb[2] = clamp255(sr);
				672	dst_argb += 4;
				673	}
				674	}
				675
				676	// Apply color matrix to a row of image. Matrix is signed.
				677	// TODO(fbarchard): Consider adding rounding (+32).
				678	void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
				679	const int8* matrix_argb, int width) {
				680	int x;
				681	for (x = 0; x < width; ++x) {
				682	int b = src_argb[0];
				683	int g = src_argb[1];
				684	int r = src_argb[2];
				685	int a = src_argb[3];
				686	int sb = (b * matrix_argb[0] + g * matrix_argb[1] +
				687	r * matrix_argb[2] + a * matrix_argb[3]) >> 6;
				688	int sg = (b * matrix_argb[4] + g * matrix_argb[5] +
				689	r * matrix_argb[6] + a * matrix_argb[7]) >> 6;
				690	int sr = (b * matrix_argb[8] + g * matrix_argb[9] +
				691	r * matrix_argb[10] + a * matrix_argb[11]) >> 6;
				692	int sa = (b * matrix_argb[12] + g * matrix_argb[13] +
				693	r * matrix_argb[14] + a * matrix_argb[15]) >> 6;
				694	dst_argb[0] = Clamp(sb);
				695	dst_argb[1] = Clamp(sg);
				696	dst_argb[2] = Clamp(sr);
				697	dst_argb[3] = Clamp(sa);
				698	src_argb += 4;
				699	dst_argb += 4;
				700	}
				701	}
				702
				703	// Apply color table to a row of image.
				704	void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
				705	int x;
				706	for (x = 0; x < width; ++x) {
				707	int b = dst_argb[0];
				708	int g = dst_argb[1];
				709	int r = dst_argb[2];
				710	int a = dst_argb[3];
				711	dst_argb[0] = table_argb[b * 4 + 0];
				712	dst_argb[1] = table_argb[g * 4 + 1];
				713	dst_argb[2] = table_argb[r * 4 + 2];
				714	dst_argb[3] = table_argb[a * 4 + 3];
				715	dst_argb += 4;
				716	}
				717	}
				718
				719	// Apply color table to a row of image.
				720	void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
				721	int x;
				722	for (x = 0; x < width; ++x) {
				723	int b = dst_argb[0];
				724	int g = dst_argb[1];
				725	int r = dst_argb[2];
				726	dst_argb[0] = table_argb[b * 4 + 0];
				727	dst_argb[1] = table_argb[g * 4 + 1];
				728	dst_argb[2] = table_argb[r * 4 + 2];
				729	dst_argb += 4;
				730	}
				731	}
				732
				733	void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
				734	int interval_offset, int width) {
				735	int x;
				736	for (x = 0; x < width; ++x) {
				737	int b = dst_argb[0];
				738	int g = dst_argb[1];
				739	int r = dst_argb[2];
				740	dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
				741	dst_argb[1] = (g * scale >> 16) * interval_size + interval_offset;
				742	dst_argb[2] = (r * scale >> 16) * interval_size + interval_offset;
				743	dst_argb += 4;
				744	}
				745	}
				746
				747	#define REPEAT8(v) (v) \| ((v) << 8)
				748	#define SHADE(f, v) v * f >> 24
				749
				750	void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
				751	uint32 value) {
				752	const uint32 b_scale = REPEAT8(value & 0xff);
				753	const uint32 g_scale = REPEAT8((value >> 8) & 0xff);
				754	const uint32 r_scale = REPEAT8((value >> 16) & 0xff);
				755	const uint32 a_scale = REPEAT8(value >> 24);
				756
				757	int i;
				758	for (i = 0; i < width; ++i) {
				759	const uint32 b = REPEAT8(src_argb[0]);
				760	const uint32 g = REPEAT8(src_argb[1]);
				761	const uint32 r = REPEAT8(src_argb[2]);
				762	const uint32 a = REPEAT8(src_argb[3]);
				763	dst_argb[0] = SHADE(b, b_scale);
				764	dst_argb[1] = SHADE(g, g_scale);
				765	dst_argb[2] = SHADE(r, r_scale);
				766	dst_argb[3] = SHADE(a, a_scale);
				767	src_argb += 4;
				768	dst_argb += 4;
				769	}
				770	}
				771	#undef REPEAT8
				772	#undef SHADE
				773
				774	#define REPEAT8(v) (v) \| ((v) << 8)
				775	#define SHADE(f, v) v * f >> 16
				776
				777	void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1,
				778	uint8* dst_argb, int width) {
				779	int i;
				780	for (i = 0; i < width; ++i) {
				781	const uint32 b = REPEAT8(src_argb0[0]);
				782	const uint32 g = REPEAT8(src_argb0[1]);
				783	const uint32 r = REPEAT8(src_argb0[2]);
				784	const uint32 a = REPEAT8(src_argb0[3]);
				785	const uint32 b_scale = src_argb1[0];
				786	const uint32 g_scale = src_argb1[1];
				787	const uint32 r_scale = src_argb1[2];
				788	const uint32 a_scale = src_argb1[3];
				789	dst_argb[0] = SHADE(b, b_scale);
				790	dst_argb[1] = SHADE(g, g_scale);
				791	dst_argb[2] = SHADE(r, r_scale);
				792	dst_argb[3] = SHADE(a, a_scale);
				793	src_argb0 += 4;
				794	src_argb1 += 4;
				795	dst_argb += 4;
				796	}
				797	}
				798	#undef REPEAT8
				799	#undef SHADE
				800
				801	#define SHADE(f, v) clamp255(v + f)
				802
				803	void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1,
				804	uint8* dst_argb, int width) {
				805	int i;
				806	for (i = 0; i < width; ++i) {
				807	const int b = src_argb0[0];
				808	const int g = src_argb0[1];
				809	const int r = src_argb0[2];
				810	const int a = src_argb0[3];
				811	const int b_add = src_argb1[0];
				812	const int g_add = src_argb1[1];
				813	const int r_add = src_argb1[2];
				814	const int a_add = src_argb1[3];
				815	dst_argb[0] = SHADE(b, b_add);
				816	dst_argb[1] = SHADE(g, g_add);
				817	dst_argb[2] = SHADE(r, r_add);
				818	dst_argb[3] = SHADE(a, a_add);
				819	src_argb0 += 4;
				820	src_argb1 += 4;
				821	dst_argb += 4;
				822	}
				823	}
				824	#undef SHADE
				825
				826	#define SHADE(f, v) clamp0(f - v)
				827
				828	void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1,
				829	uint8* dst_argb, int width) {
				830	int i;
				831	for (i = 0; i < width; ++i) {
				832	const int b = src_argb0[0];
				833	const int g = src_argb0[1];
				834	const int r = src_argb0[2];
				835	const int a = src_argb0[3];
				836	const int b_sub = src_argb1[0];
				837	const int g_sub = src_argb1[1];
				838	const int r_sub = src_argb1[2];
				839	const int a_sub = src_argb1[3];
				840	dst_argb[0] = SHADE(b, b_sub);
				841	dst_argb[1] = SHADE(g, g_sub);
				842	dst_argb[2] = SHADE(r, r_sub);
				843	dst_argb[3] = SHADE(a, a_sub);
				844	src_argb0 += 4;
				845	src_argb1 += 4;
				846	dst_argb += 4;
				847	}
				848	}
				849	#undef SHADE
				850
				851	// Sobel functions which mimics SSSE3.
				852	void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
				853	uint8* dst_sobelx, int width) {
				854	int i;
				855	for (i = 0; i < width; ++i) {
				856	int a = src_y0[i];
				857	int b = src_y1[i];
				858	int c = src_y2[i];
				859	int a_sub = src_y0[i + 2];
				860	int b_sub = src_y1[i + 2];
				861	int c_sub = src_y2[i + 2];
				862	int a_diff = a - a_sub;
				863	int b_diff = b - b_sub;
				864	int c_diff = c - c_sub;
				865	int sobel = Abs(a_diff + b_diff * 2 + c_diff);
				866	dst_sobelx[i] = (uint8)(clamp255(sobel));
				867	}
				868	}
				869
				870	void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
				871	uint8* dst_sobely, int width) {
				872	int i;
				873	for (i = 0; i < width; ++i) {
				874	int a = src_y0[i + 0];
				875	int b = src_y0[i + 1];
				876	int c = src_y0[i + 2];
				877	int a_sub = src_y1[i + 0];
				878	int b_sub = src_y1[i + 1];
				879	int c_sub = src_y1[i + 2];
				880	int a_diff = a - a_sub;
				881	int b_diff = b - b_sub;
				882	int c_diff = c - c_sub;
				883	int sobel = Abs(a_diff + b_diff * 2 + c_diff);
				884	dst_sobely[i] = (uint8)(clamp255(sobel));
				885	}
				886	}
				887
				888	void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
				889	uint8* dst_argb, int width) {
				890	int i;
				891	for (i = 0; i < width; ++i) {
				892	int r = src_sobelx[i];
				893	int b = src_sobely[i];
				894	int s = clamp255(r + b);
				895	dst_argb[0] = (uint8)(s);
				896	dst_argb[1] = (uint8)(s);
				897	dst_argb[2] = (uint8)(s);
				898	dst_argb[3] = (uint8)(255u);
				899	dst_argb += 4;
				900	}
				901	}
				902
				903	void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
				904	uint8* dst_y, int width) {
				905	int i;
				906	for (i = 0; i < width; ++i) {
				907	int r = src_sobelx[i];
				908	int b = src_sobely[i];
				909	int s = clamp255(r + b);
				910	dst_y[i] = (uint8)(s);
				911	}
				912	}
				913
				914	void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
				915	uint8* dst_argb, int width) {
				916	int i;
				917	for (i = 0; i < width; ++i) {
				918	int r = src_sobelx[i];
				919	int b = src_sobely[i];
				920	int g = clamp255(r + b);
				921	dst_argb[0] = (uint8)(b);
				922	dst_argb[1] = (uint8)(g);
				923	dst_argb[2] = (uint8)(r);
				924	dst_argb[3] = (uint8)(255u);
				925	dst_argb += 4;
				926	}
				927	}
				928
				929	void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
				930	// Copy a Y to RGB.
				931	int x;
				932	for (x = 0; x < width; ++x) {
				933	uint8 y = src_y[0];
				934	dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
				935	dst_argb[3] = 255u;
				936	dst_argb += 4;
				937	++src_y;
				938	}
				939	}
				940
				941	// C reference code that mimics the YUV assembly.
				942
				943	#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
				944
				945	#define UB 127 /* min(63,(int8)(2.018 * 64)) */
				946	#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
				947	#define UR 0
				948
				949	#define VB 0
				950	#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
				951	#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
				952
				953	// Bias
				954	#define BB UB * 128 + VB * 128
				955	#define BG UG * 128 + VG * 128
				956	#define BR UR * 128 + VR * 128
				957
				958	static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
				959	uint8* b, uint8* g, uint8* r) {
				960	int32 y1 = ((int32)(y) - 16) * YG;
				961	b = Clamp((int32)((u UB + v * VB) - (BB) + y1) >> 6);
				962	g = Clamp((int32)((u UG + v * VG) - (BG) + y1) >> 6);
				963	r = Clamp((int32)((u UR + v * VR) - (BR) + y1) >> 6);
				964	}
				965
				966	#if !defined(LIBYUV_DISABLE_NEON) && \
				967	(defined(__ARM_NEON__) \|\| defined(LIBYUV_NEON))
				968	// C mimic assembly.
				969	// TODO(fbarchard): Remove subsampling from Neon.
				970	void I444ToARGBRow_C(const uint8* src_y,
				971	const uint8* src_u,
				972	const uint8* src_v,
				973	uint8* rgb_buf,
				974	int width) {
				975	int x;
				976	for (x = 0; x < width - 1; x += 2) {
				977	uint8 u = (src_u[0] + src_u[1] + 1) >> 1;
				978	uint8 v = (src_v[0] + src_v[1] + 1) >> 1;
				979	YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
				980	rgb_buf[3] = 255;
				981	YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
				982	rgb_buf[7] = 255;
				983	src_y += 2;
				984	src_u += 2;
				985	src_v += 2;
				986	rgb_buf += 8; // Advance 2 pixels.
				987	}
				988	if (width & 1) {
				989	YuvPixel(src_y[0], src_u[0], src_v[0],
				990	rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
				991	}
				992	}
				993	#else
				994	void I444ToARGBRow_C(const uint8* src_y,
				995	const uint8* src_u,
				996	const uint8* src_v,
				997	uint8* rgb_buf,
				998	int width) {
				999	int x;
				1000	for (x = 0; x < width; ++x) {
				1001	YuvPixel(src_y[0], src_u[0], src_v[0],
				1002	rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
				1003	rgb_buf[3] = 255;
				1004	src_y += 1;
				1005	src_u += 1;
				1006	src_v += 1;
				1007	rgb_buf += 4; // Advance 1 pixel.
				1008	}
				1009	}
				1010	#endif
				1011	// Also used for 420
				1012	void I422ToARGBRow_C(const uint8* src_y,
				1013	const uint8* src_u,
				1014	const uint8* src_v,
				1015	uint8* rgb_buf,
				1016	int width) {
				1017	int x;
				1018	for (x = 0; x < width - 1; x += 2) {
				1019	YuvPixel(src_y[0], src_u[0], src_v[0],
				1020	rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
				1021	rgb_buf[3] = 255;
				1022	YuvPixel(src_y[1], src_u[0], src_v[0],
				1023	rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
				1024	rgb_buf[7] = 255;
				1025	src_y += 2;
				1026	src_u += 1;
				1027	src_v += 1;
				1028	rgb_buf += 8; // Advance 2 pixels.
				1029	}
				1030	if (width & 1) {
				1031	YuvPixel(src_y[0], src_u[0], src_v[0],
				1032	rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
				1033	rgb_buf[3] = 255;
				1034	}
				1035	}
				1036
				1037	void I422ToRGB24Row_C(const uint8* src_y,
				1038	const uint8* src_u,
				1039	const uint8* src_v,
				1040	uint8* rgb_buf,
				1041	int width) {
				1042	int x;
				1043	for (x = 0; x < width - 1; x += 2) {
				1044	YuvPixel(src_y[0], src_u[0], src_v[0],
				1045	rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
				1046	YuvPixel(src_y[1], src_u[0], src_v[0],
				1047	rgb_buf + 3, rgb_buf + 4, rgb_buf + 5);
				1048	src_y += 2;
				1049	src_u += 1;
				1050	src_v += 1;
				1051	rgb_buf += 6; // Advance 2 pixels.
				1052	}
				1053	if (width & 1) {
				1054	YuvPixel(src_y[0], src_u[0], src_v[0],
				1055	rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
				1056	}
				1057	}
				1058
				1059	void I422ToRAWRow_C(const uint8* src_y,
				1060	const uint8* src_u,
				1061	const uint8* src_v,
				1062	uint8* rgb_buf,
				1063	int width) {
				1064	int x;
				1065	for (x = 0; x < width - 1; x += 2) {
				1066	YuvPixel(src_y[0], src_u[0], src_v[0],
				1067	rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
				1068	YuvPixel(src_y[1], src_u[0], src_v[0],
				1069	rgb_buf + 5, rgb_buf + 4, rgb_buf + 3);
				1070	src_y += 2;
				1071	src_u += 1;
				1072	src_v += 1;
				1073	rgb_buf += 6; // Advance 2 pixels.
				1074	}
				1075	if (width & 1) {
				1076	YuvPixel(src_y[0], src_u[0], src_v[0],
				1077	rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
				1078	}
				1079	}
				1080
				1081	void I422ToARGB4444Row_C(const uint8* src_y,
				1082	const uint8* src_u,
				1083	const uint8* src_v,
				1084	uint8* dst_argb4444,
				1085	int width) {
				1086	uint8 b0;
				1087	uint8 g0;
				1088	uint8 r0;
				1089	uint8 b1;
				1090	uint8 g1;
				1091	uint8 r1;
				1092	int x;
				1093	for (x = 0; x < width - 1; x += 2) {
				1094	YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
				1095	YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
				1096	b0 = b0 >> 4;
				1097	g0 = g0 >> 4;
				1098	r0 = r0 >> 4;
				1099	b1 = b1 >> 4;
				1100	g1 = g1 >> 4;
				1101	r1 = r1 >> 4;
				1102	(uint32)(dst_argb4444) = b0 \| (g0 << 4) \| (r0 << 8) \|
				1103	(b1 << 16) \| (g1 << 20) \| (r1 << 24) \| 0xf000f000;
				1104	src_y += 2;
				1105	src_u += 1;
				1106	src_v += 1;
				1107	dst_argb4444 += 4; // Advance 2 pixels.
				1108	}
				1109	if (width & 1) {
				1110	YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
				1111	b0 = b0 >> 4;
				1112	g0 = g0 >> 4;
				1113	r0 = r0 >> 4;
				1114	(uint16)(dst_argb4444) = b0 \| (g0 << 4) \| (r0 << 8) \|
				1115	0xf000;
				1116	}
				1117	}
				1118
				1119	void I422ToARGB1555Row_C(const uint8* src_y,
				1120	const uint8* src_u,
				1121	const uint8* src_v,
				1122	uint8* dst_argb1555,
				1123	int width) {
				1124	uint8 b0;
				1125	uint8 g0;
				1126	uint8 r0;
				1127	uint8 b1;
				1128	uint8 g1;
				1129	uint8 r1;
				1130	int x;
				1131	for (x = 0; x < width - 1; x += 2) {
				1132	YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
				1133	YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
				1134	b0 = b0 >> 3;
				1135	g0 = g0 >> 3;
				1136	r0 = r0 >> 3;
				1137	b1 = b1 >> 3;
				1138	g1 = g1 >> 3;
				1139	r1 = r1 >> 3;
				1140	(uint32)(dst_argb1555) = b0 \| (g0 << 5) \| (r0 << 10) \|
				1141	(b1 << 16) \| (g1 << 21) \| (r1 << 26) \| 0x80008000;
				1142	src_y += 2;
				1143	src_u += 1;
				1144	src_v += 1;
				1145	dst_argb1555 += 4; // Advance 2 pixels.
				1146	}
				1147	if (width & 1) {
				1148	YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
				1149	b0 = b0 >> 3;
				1150	g0 = g0 >> 3;
				1151	r0 = r0 >> 3;
				1152	(uint16)(dst_argb1555) = b0 \| (g0 << 5) \| (r0 << 10) \|
				1153	0x8000;
				1154	}
				1155	}
				1156
				1157	void I422ToRGB565Row_C(const uint8* src_y,
				1158	const uint8* src_u,
				1159	const uint8* src_v,
				1160	uint8* dst_rgb565,
				1161	int width) {
				1162	uint8 b0;
				1163	uint8 g0;
				1164	uint8 r0;
				1165	uint8 b1;
				1166	uint8 g1;
				1167	uint8 r1;
				1168	int x;
				1169	for (x = 0; x < width - 1; x += 2) {
				1170	YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
				1171	YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
				1172	b0 = b0 >> 3;
				1173	g0 = g0 >> 2;
				1174	r0 = r0 >> 3;
				1175	b1 = b1 >> 3;
				1176	g1 = g1 >> 2;
				1177	r1 = r1 >> 3;
				1178	(uint32)(dst_rgb565) = b0 \| (g0 << 5) \| (r0 << 11) \|
				1179	(b1 << 16) \| (g1 << 21) \| (r1 << 27);
				1180	src_y += 2;
				1181	src_u += 1;
				1182	src_v += 1;
				1183	dst_rgb565 += 4; // Advance 2 pixels.
				1184	}
				1185	if (width & 1) {
				1186	YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
				1187	b0 = b0 >> 3;
				1188	g0 = g0 >> 2;
				1189	r0 = r0 >> 3;
				1190	(uint16)(dst_rgb565) = b0 \| (g0 << 5) \| (r0 << 11);
				1191	}
				1192	}
				1193
				1194	void I411ToARGBRow_C(const uint8* src_y,
				1195	const uint8* src_u,
				1196	const uint8* src_v,
				1197	uint8* rgb_buf,
				1198	int width) {
				1199	int x;
				1200	for (x = 0; x < width - 3; x += 4) {
				1201	YuvPixel(src_y[0], src_u[0], src_v[0],
				1202	rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
				1203	rgb_buf[3] = 255;
				1204	YuvPixel(src_y[1], src_u[0], src_v[0],
				1205	rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
				1206	rgb_buf[7] = 255;
				1207	YuvPixel(src_y[2], src_u[0], src_v[0],
				1208	rgb_buf + 8, rgb_buf + 9, rgb_buf + 10);
				1209	rgb_buf[11] = 255;
				1210	YuvPixel(src_y[3], src_u[0], src_v[0],
				1211	rgb_buf + 12, rgb_buf + 13, rgb_buf + 14);
				1212	rgb_buf[15] = 255;
				1213	src_y += 4;
				1214	src_u += 1;
				1215	src_v += 1;
				1216	rgb_buf += 16; // Advance 4 pixels.
				1217	}
				1218	if (width & 2) {
				1219	YuvPixel(src_y[0], src_u[0], src_v[0],
				1220	rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
				1221	rgb_buf[3] = 255;
				1222	YuvPixel(src_y[1], src_u[0], src_v[0],
				1223	rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
				1224	rgb_buf[7] = 255;
				1225	src_y += 2;
				1226	rgb_buf += 8; // Advance 2 pixels.
				1227	}
				1228	if (width & 1) {
				1229	YuvPixel(src_y[0], src_u[0], src_v[0],
				1230	rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
				1231	rgb_buf[3] = 255;
				1232	}
				1233	}
				1234
				1235	void NV12ToARGBRow_C(const uint8* src_y,
				1236	const uint8* usrc_v,
				1237	uint8* rgb_buf,
				1238	int width) {
				1239	int x;
				1240	for (x = 0; x < width - 1; x += 2) {
				1241	YuvPixel(src_y[0], usrc_v[0], usrc_v[1],
				1242	rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
				1243	rgb_buf[3] = 255;
				1244	YuvPixel(src_y[1], usrc_v[0], usrc_v[1],
				1245	rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
				1246	rgb_buf[7] = 255;
				1247	src_y += 2;
				1248	usrc_v += 2;
				1249	rgb_buf += 8; // Advance 2 pixels.
				1250	}
				1251	if (width & 1) {
				1252	YuvPixel(src_y[0], usrc_v[0], usrc_v[1],
				1253	rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
				1254	rgb_buf[3] = 255;
				1255	}
				1256	}
				1257
				1258	void NV21ToARGBRow_C(const uint8* src_y,
				1259	const uint8* src_vu,
				1260	uint8* rgb_buf,
				1261	int width) {
				1262	int x;
				1263	for (x = 0; x < width - 1; x += 2) {
				1264	YuvPixel(src_y[0], src_vu[1], src_vu[0],
				1265	rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
				1266	rgb_buf[3] = 255;
				1267
				1268	YuvPixel(src_y[1], src_vu[1], src_vu[0],
				1269	rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
				1270	rgb_buf[7] = 255;
				1271
				1272	src_y += 2;
				1273	src_vu += 2;
				1274	rgb_buf += 8; // Advance 2 pixels.
				1275	}
				1276	if (width & 1) {
				1277	YuvPixel(src_y[0], src_vu[1], src_vu[0],
				1278	rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
				1279	rgb_buf[3] = 255;
				1280	}
				1281	}
				1282
				1283	void NV12ToRGB565Row_C(const uint8* src_y,
				1284	const uint8* usrc_v,
				1285	uint8* dst_rgb565,
				1286	int width) {
				1287	uint8 b0;
				1288	uint8 g0;
				1289	uint8 r0;
				1290	uint8 b1;
				1291	uint8 g1;
				1292	uint8 r1;
				1293	int x;
				1294	for (x = 0; x < width - 1; x += 2) {
				1295	YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0);
				1296	YuvPixel(src_y[1], usrc_v[0], usrc_v[1], &b1, &g1, &r1);
				1297	b0 = b0 >> 3;
				1298	g0 = g0 >> 2;
				1299	r0 = r0 >> 3;
				1300	b1 = b1 >> 3;
				1301	g1 = g1 >> 2;
				1302	r1 = r1 >> 3;
				1303	(uint32)(dst_rgb565) = b0 \| (g0 << 5) \| (r0 << 11) \|
				1304	(b1 << 16) \| (g1 << 21) \| (r1 << 27);
				1305	src_y += 2;
				1306	usrc_v += 2;
				1307	dst_rgb565 += 4; // Advance 2 pixels.
				1308	}
				1309	if (width & 1) {
				1310	YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0);
				1311	b0 = b0 >> 3;
				1312	g0 = g0 >> 2;
				1313	r0 = r0 >> 3;
				1314	(uint16)(dst_rgb565) = b0 \| (g0 << 5) \| (r0 << 11);
				1315	}
				1316	}
				1317
				1318	void NV21ToRGB565Row_C(const uint8* src_y,
				1319	const uint8* vsrc_u,
				1320	uint8* dst_rgb565,
				1321	int width) {
				1322	uint8 b0;
				1323	uint8 g0;
				1324	uint8 r0;
				1325	uint8 b1;
				1326	uint8 g1;
				1327	uint8 r1;
				1328	int x;
				1329	for (x = 0; x < width - 1; x += 2) {
				1330	YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0);
				1331	YuvPixel(src_y[1], vsrc_u[1], vsrc_u[0], &b1, &g1, &r1);
				1332	b0 = b0 >> 3;
				1333	g0 = g0 >> 2;
				1334	r0 = r0 >> 3;
				1335	b1 = b1 >> 3;
				1336	g1 = g1 >> 2;
				1337	r1 = r1 >> 3;
				1338	(uint32)(dst_rgb565) = b0 \| (g0 << 5) \| (r0 << 11) \|
				1339	(b1 << 16) \| (g1 << 21) \| (r1 << 27);
				1340	src_y += 2;
				1341	vsrc_u += 2;
				1342	dst_rgb565 += 4; // Advance 2 pixels.
				1343	}
				1344	if (width & 1) {
				1345	YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0);
				1346	b0 = b0 >> 3;
				1347	g0 = g0 >> 2;
				1348	r0 = r0 >> 3;
				1349	(uint16)(dst_rgb565) = b0 \| (g0 << 5) \| (r0 << 11);
				1350	}
				1351	}
				1352
				1353	void YUY2ToARGBRow_C(const uint8* src_yuy2,
				1354	uint8* rgb_buf,
				1355	int width) {
				1356	int x;
				1357	for (x = 0; x < width - 1; x += 2) {
				1358	YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
				1359	rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
				1360	rgb_buf[3] = 255;
				1361	YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3],
				1362	rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
				1363	rgb_buf[7] = 255;
				1364	src_yuy2 += 4;
				1365	rgb_buf += 8; // Advance 2 pixels.
				1366	}
				1367	if (width & 1) {
				1368	YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
				1369	rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
				1370	rgb_buf[3] = 255;
				1371	}
				1372	}
				1373
				1374	void UYVYToARGBRow_C(const uint8* src_uyvy,
				1375	uint8* rgb_buf,
				1376	int width) {
				1377	int x;
				1378	for (x = 0; x < width - 1; x += 2) {
				1379	YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
				1380	rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
				1381	rgb_buf[3] = 255;
				1382	YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2],
				1383	rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
				1384	rgb_buf[7] = 255;
				1385	src_uyvy += 4;
				1386	rgb_buf += 8; // Advance 2 pixels.
				1387	}
				1388	if (width & 1) {
				1389	YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
				1390	rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
				1391	rgb_buf[3] = 255;
				1392	}
				1393	}
				1394
				1395	void I422ToBGRARow_C(const uint8* src_y,
				1396	const uint8* src_u,
				1397	const uint8* src_v,
				1398	uint8* rgb_buf,
				1399	int width) {
				1400	int x;
				1401	for (x = 0; x < width - 1; x += 2) {
				1402	YuvPixel(src_y[0], src_u[0], src_v[0],
				1403	rgb_buf + 3, rgb_buf + 2, rgb_buf + 1);
				1404	rgb_buf[0] = 255;
				1405	YuvPixel(src_y[1], src_u[0], src_v[0],
				1406	rgb_buf + 7, rgb_buf + 6, rgb_buf + 5);
				1407	rgb_buf[4] = 255;
				1408	src_y += 2;
				1409	src_u += 1;
				1410	src_v += 1;
				1411	rgb_buf += 8; // Advance 2 pixels.
				1412	}
				1413	if (width & 1) {
				1414	YuvPixel(src_y[0], src_u[0], src_v[0],
				1415	rgb_buf + 3, rgb_buf + 2, rgb_buf + 1);
				1416	rgb_buf[0] = 255;
				1417	}
				1418	}
				1419
				1420	void I422ToABGRRow_C(const uint8* src_y,
				1421	const uint8* src_u,
				1422	const uint8* src_v,
				1423	uint8* rgb_buf,
				1424	int width) {
				1425	int x;
				1426	for (x = 0; x < width - 1; x += 2) {
				1427	YuvPixel(src_y[0], src_u[0], src_v[0],
				1428	rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
				1429	rgb_buf[3] = 255;
				1430	YuvPixel(src_y[1], src_u[0], src_v[0],
				1431	rgb_buf + 6, rgb_buf + 5, rgb_buf + 4);
				1432	rgb_buf[7] = 255;
				1433	src_y += 2;
				1434	src_u += 1;
				1435	src_v += 1;
				1436	rgb_buf += 8; // Advance 2 pixels.
				1437	}
				1438	if (width & 1) {
				1439	YuvPixel(src_y[0], src_u[0], src_v[0],
				1440	rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
				1441	rgb_buf[3] = 255;
				1442	}
				1443	}
				1444
				1445	void I422ToRGBARow_C(const uint8* src_y,
				1446	const uint8* src_u,
				1447	const uint8* src_v,
				1448	uint8* rgb_buf,
				1449	int width) {
				1450	int x;
				1451	for (x = 0; x < width - 1; x += 2) {
				1452	YuvPixel(src_y[0], src_u[0], src_v[0],
				1453	rgb_buf + 1, rgb_buf + 2, rgb_buf + 3);
				1454	rgb_buf[0] = 255;
				1455	YuvPixel(src_y[1], src_u[0], src_v[0],
				1456	rgb_buf + 5, rgb_buf + 6, rgb_buf + 7);
				1457	rgb_buf[4] = 255;
				1458	src_y += 2;
				1459	src_u += 1;
				1460	src_v += 1;
				1461	rgb_buf += 8; // Advance 2 pixels.
				1462	}
				1463	if (width & 1) {
				1464	YuvPixel(src_y[0], src_u[0], src_v[0],
				1465	rgb_buf + 1, rgb_buf + 2, rgb_buf + 3);
				1466	rgb_buf[0] = 255;
				1467	}
				1468	}
				1469
				1470	void YToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
				1471	int x;
				1472	for (x = 0; x < width - 1; x += 2) {
				1473	YuvPixel(src_y[0], 128, 128,
				1474	rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
				1475	rgb_buf[3] = 255;
				1476	YuvPixel(src_y[1], 128, 128,
				1477	rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
				1478	rgb_buf[7] = 255;
				1479	src_y += 2;
				1480	rgb_buf += 8; // Advance 2 pixels.
				1481	}
				1482	if (width & 1) {
				1483	YuvPixel(src_y[0], 128, 128,
				1484	rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
				1485	rgb_buf[3] = 255;
				1486	}
				1487	}
				1488
				1489	void MirrorRow_C(const uint8* src, uint8* dst, int width) {
				1490	int x;
				1491	src += width - 1;
				1492	for (x = 0; x < width - 1; x += 2) {
				1493	dst[x] = src[0];
				1494	dst[x + 1] = src[-1];
				1495	src -= 2;
				1496	}
				1497	if (width & 1) {
				1498	dst[width - 1] = src[0];
				1499	}
				1500	}
				1501
				1502	void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
				1503	int x;
				1504	src_uv += (width - 1) << 1;
				1505	for (x = 0; x < width - 1; x += 2) {
				1506	dst_u[x] = src_uv[0];
				1507	dst_u[x + 1] = src_uv[-2];
				1508	dst_v[x] = src_uv[1];
				1509	dst_v[x + 1] = src_uv[-2 + 1];
				1510	src_uv -= 4;
				1511	}
				1512	if (width & 1) {
				1513	dst_u[width - 1] = src_uv[0];
				1514	dst_v[width - 1] = src_uv[1];
				1515	}
				1516	}
				1517
				1518	void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) {
				1519	int x;
				1520	const uint32* src32 = (const uint32*)(src);
				1521	uint32* dst32 = (uint32*)(dst);
				1522	src32 += width - 1;
				1523	for (x = 0; x < width - 1; x += 2) {
				1524	dst32[x] = src32[0];
				1525	dst32[x + 1] = src32[-1];
				1526	src32 -= 2;
				1527	}
				1528	if (width & 1) {
				1529	dst32[width - 1] = src32[0];
				1530	}
				1531	}
				1532
				1533	void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
				1534	int x;
				1535	for (x = 0; x < width - 1; x += 2) {
				1536	dst_u[x] = src_uv[0];
				1537	dst_u[x + 1] = src_uv[2];
				1538	dst_v[x] = src_uv[1];
				1539	dst_v[x + 1] = src_uv[3];
				1540	src_uv += 4;
				1541	}
				1542	if (width & 1) {
				1543	dst_u[width - 1] = src_uv[0];
				1544	dst_v[width - 1] = src_uv[1];
				1545	}
				1546	}
				1547
				1548	void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
				1549	int width) {
				1550	int x;
				1551	for (x = 0; x < width - 1; x += 2) {
				1552	dst_uv[0] = src_u[x];
				1553	dst_uv[1] = src_v[x];
				1554	dst_uv[2] = src_u[x + 1];
				1555	dst_uv[3] = src_v[x + 1];
				1556	dst_uv += 4;
				1557	}
				1558	if (width & 1) {
				1559	dst_uv[0] = src_u[width - 1];
				1560	dst_uv[1] = src_v[width - 1];
				1561	}
				1562	}
				1563
				1564	void CopyRow_C(const uint8* src, uint8* dst, int count) {
				1565	memcpy(dst, src, count);
				1566	}
				1567
				1568	void CopyRow_16_C(const uint16* src, uint16* dst, int count) {
				1569	memcpy(dst, src, count * 2);
				1570	}
				1571
				1572	void SetRow_C(uint8* dst, uint32 v8, int count) {
				1573	#ifdef _MSC_VER
				1574	// VC will generate rep stosb.
				1575	int x;
				1576	for (x = 0; x < count; ++x) {
				1577	dst[x] = v8;
				1578	}
				1579	#else
				1580	memset(dst, v8, count);
				1581	#endif
				1582	}
				1583
				1584	void ARGBSetRows_C(uint8* dst, uint32 v32, int width,
				1585	int dst_stride, int height) {
				1586	int y;
				1587	for (y = 0; y < height; ++y) {
				1588	uint32* d = (uint32*)(dst);
				1589	int x;
				1590	for (x = 0; x < width; ++x) {
				1591	d[x] = v32;
				1592	}
				1593	dst += dst_stride;
				1594	}
				1595	}
				1596
				1597	// Filter 2 rows of YUY2 UV's (422) into U and V (420).
				1598	void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
				1599	uint8* dst_u, uint8* dst_v, int width) {
				1600	// Output a row of UV values, filtering 2 rows of YUY2.
				1601	int x;
				1602	for (x = 0; x < width; x += 2) {
				1603	dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
				1604	dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
				1605	src_yuy2 += 4;
				1606	dst_u += 1;
				1607	dst_v += 1;
				1608	}
				1609	}
				1610
				1611	// Copy row of YUY2 UV's (422) into U and V (422).
				1612	void YUY2ToUV422Row_C(const uint8* src_yuy2,
				1613	uint8* dst_u, uint8* dst_v, int width) {
				1614	// Output a row of UV values.
				1615	int x;
				1616	for (x = 0; x < width; x += 2) {
				1617	dst_u[0] = src_yuy2[1];
				1618	dst_v[0] = src_yuy2[3];
				1619	src_yuy2 += 4;
				1620	dst_u += 1;
				1621	dst_v += 1;
				1622	}
				1623	}
				1624
				1625	// Copy row of YUY2 Y's (422) into Y (420/422).
				1626	void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
				1627	// Output a row of Y values.
				1628	int x;
				1629	for (x = 0; x < width - 1; x += 2) {
				1630	dst_y[x] = src_yuy2[0];
				1631	dst_y[x + 1] = src_yuy2[2];
				1632	src_yuy2 += 4;
				1633	}
				1634	if (width & 1) {
				1635	dst_y[width - 1] = src_yuy2[0];
				1636	}
				1637	}
				1638
				1639	// Filter 2 rows of UYVY UV's (422) into U and V (420).
				1640	void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
				1641	uint8* dst_u, uint8* dst_v, int width) {
				1642	// Output a row of UV values.
				1643	int x;
				1644	for (x = 0; x < width; x += 2) {
				1645	dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
				1646	dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
				1647	src_uyvy += 4;
				1648	dst_u += 1;
				1649	dst_v += 1;
				1650	}
				1651	}
				1652
				1653	// Copy row of UYVY UV's (422) into U and V (422).
				1654	void UYVYToUV422Row_C(const uint8* src_uyvy,
				1655	uint8* dst_u, uint8* dst_v, int width) {
				1656	// Output a row of UV values.
				1657	int x;
				1658	for (x = 0; x < width; x += 2) {
				1659	dst_u[0] = src_uyvy[0];
				1660	dst_v[0] = src_uyvy[2];
				1661	src_uyvy += 4;
				1662	dst_u += 1;
				1663	dst_v += 1;
				1664	}
				1665	}
				1666
				1667	// Copy row of UYVY Y's (422) into Y (420/422).
				1668	void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
				1669	// Output a row of Y values.
				1670	int x;
				1671	for (x = 0; x < width - 1; x += 2) {
				1672	dst_y[x] = src_uyvy[1];
				1673	dst_y[x + 1] = src_uyvy[3];
				1674	src_uyvy += 4;
				1675	}
				1676	if (width & 1) {
				1677	dst_y[width - 1] = src_uyvy[1];
				1678	}
				1679	}
				1680
				1681	#define BLEND(f, b, a) (((256 - a) * b) >> 8) + f
				1682
				1683	// Blend src_argb0 over src_argb1 and store to dst_argb.
				1684	// dst_argb may be src_argb0 or src_argb1.
				1685	// This code mimics the SSSE3 version for better testability.
				1686	void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
				1687	uint8* dst_argb, int width) {
				1688	int x;
				1689	for (x = 0; x < width - 1; x += 2) {
				1690	uint32 fb = src_argb0[0];
				1691	uint32 fg = src_argb0[1];
				1692	uint32 fr = src_argb0[2];
				1693	uint32 a = src_argb0[3];
				1694	uint32 bb = src_argb1[0];
				1695	uint32 bg = src_argb1[1];
				1696	uint32 br = src_argb1[2];
				1697	dst_argb[0] = BLEND(fb, bb, a);
				1698	dst_argb[1] = BLEND(fg, bg, a);
				1699	dst_argb[2] = BLEND(fr, br, a);
				1700	dst_argb[3] = 255u;
				1701
				1702	fb = src_argb0[4 + 0];
				1703	fg = src_argb0[4 + 1];
				1704	fr = src_argb0[4 + 2];
				1705	a = src_argb0[4 + 3];
				1706	bb = src_argb1[4 + 0];
				1707	bg = src_argb1[4 + 1];
				1708	br = src_argb1[4 + 2];
				1709	dst_argb[4 + 0] = BLEND(fb, bb, a);
				1710	dst_argb[4 + 1] = BLEND(fg, bg, a);
				1711	dst_argb[4 + 2] = BLEND(fr, br, a);
				1712	dst_argb[4 + 3] = 255u;
				1713	src_argb0 += 8;
				1714	src_argb1 += 8;
				1715	dst_argb += 8;
				1716	}
				1717
				1718	if (width & 1) {
				1719	uint32 fb = src_argb0[0];
				1720	uint32 fg = src_argb0[1];
				1721	uint32 fr = src_argb0[2];
				1722	uint32 a = src_argb0[3];
				1723	uint32 bb = src_argb1[0];
				1724	uint32 bg = src_argb1[1];
				1725	uint32 br = src_argb1[2];
				1726	dst_argb[0] = BLEND(fb, bb, a);
				1727	dst_argb[1] = BLEND(fg, bg, a);
				1728	dst_argb[2] = BLEND(fr, br, a);
				1729	dst_argb[3] = 255u;
				1730	}
				1731	}
				1732	#undef BLEND
				1733	#define ATTENUATE(f, a) (a \| (a << 8)) * (f \| (f << 8)) >> 24
				1734
				1735	// Multiply source RGB by alpha and store to destination.
				1736	// This code mimics the SSSE3 version for better testability.
				1737	void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
				1738	int i;
				1739	for (i = 0; i < width - 1; i += 2) {
				1740	uint32 b = src_argb[0];
				1741	uint32 g = src_argb[1];
				1742	uint32 r = src_argb[2];
				1743	uint32 a = src_argb[3];
				1744	dst_argb[0] = ATTENUATE(b, a);
				1745	dst_argb[1] = ATTENUATE(g, a);
				1746	dst_argb[2] = ATTENUATE(r, a);
				1747	dst_argb[3] = a;
				1748	b = src_argb[4];
				1749	g = src_argb[5];
				1750	r = src_argb[6];
				1751	a = src_argb[7];
				1752	dst_argb[4] = ATTENUATE(b, a);
				1753	dst_argb[5] = ATTENUATE(g, a);
				1754	dst_argb[6] = ATTENUATE(r, a);
				1755	dst_argb[7] = a;
				1756	src_argb += 8;
				1757	dst_argb += 8;
				1758	}
				1759
				1760	if (width & 1) {
				1761	const uint32 b = src_argb[0];
				1762	const uint32 g = src_argb[1];
				1763	const uint32 r = src_argb[2];
				1764	const uint32 a = src_argb[3];
				1765	dst_argb[0] = ATTENUATE(b, a);
				1766	dst_argb[1] = ATTENUATE(g, a);
				1767	dst_argb[2] = ATTENUATE(r, a);
				1768	dst_argb[3] = a;
				1769	}
				1770	}
				1771	#undef ATTENUATE
				1772
				1773	// Divide source RGB by alpha and store to destination.
				1774	// b = (b * 255 + (a / 2)) / a;
				1775	// g = (g * 255 + (a / 2)) / a;
				1776	// r = (r * 255 + (a / 2)) / a;
				1777	// Reciprocal method is off by 1 on some values. ie 125
				1778	// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower.
				1779	#define T(a) 0x01000000 + (0x10000 / a)
				1780	const uint32 fixed_invtbl8[256] = {
				1781	0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
				1782	T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
				1783	T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
				1784	T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
				1785	T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
				1786	T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
				1787	T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
				1788	T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
				1789	T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
				1790	T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
				1791	T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
				1792	T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
				1793	T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
				1794	T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
				1795	T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
				1796	T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
				1797	T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
				1798	T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
				1799	T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
				1800	T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
				1801	T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
				1802	T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
				1803	T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
				1804	T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
				1805	T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
				1806	T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
				1807	T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
				1808	T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
				1809	T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
				1810	T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
				1811	T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
				1812	T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100 };
				1813	#undef T
				1814
				1815	void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
				1816	int i;
				1817	for (i = 0; i < width; ++i) {
				1818	uint32 b = src_argb[0];
				1819	uint32 g = src_argb[1];
				1820	uint32 r = src_argb[2];
				1821	const uint32 a = src_argb[3];
				1822	const uint32 ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point
				1823	b = (b * ia) >> 8;
				1824	g = (g * ia) >> 8;
				1825	r = (r * ia) >> 8;
				1826	// Clamping should not be necessary but is free in assembly.
				1827	dst_argb[0] = clamp255(b);
				1828	dst_argb[1] = clamp255(g);
				1829	dst_argb[2] = clamp255(r);
				1830	dst_argb[3] = a;
				1831	src_argb += 4;
				1832	dst_argb += 4;
				1833	}
				1834	}
				1835
				1836	void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
				1837	const int32* previous_cumsum, int width) {
				1838	int32 row_sum[4] = {0, 0, 0, 0};
				1839	int x;
				1840	for (x = 0; x < width; ++x) {
				1841	row_sum[0] += row[x * 4 + 0];
				1842	row_sum[1] += row[x * 4 + 1];
				1843	row_sum[2] += row[x * 4 + 2];
				1844	row_sum[3] += row[x * 4 + 3];
				1845	cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0];
				1846	cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1];
				1847	cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2];
				1848	cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3];
				1849	}
				1850	}
				1851
				1852	void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl,
				1853	int w, int area, uint8* dst, int count) {
				1854	float ooa = 1.0f / area;
				1855	int i;
				1856	for (i = 0; i < count; ++i) {
				1857	dst[0] = (uint8)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa);
				1858	dst[1] = (uint8)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa);
				1859	dst[2] = (uint8)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa);
				1860	dst[3] = (uint8)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa);
				1861	dst += 4;
				1862	tl += 4;
				1863	bl += 4;
				1864	}
				1865	}
				1866
				1867	// Copy pixels from rotated source to destination row with a slope.
				1868	LIBYUV_API
				1869	void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
				1870	uint8* dst_argb, const float* uv_dudv, int width) {
				1871	int i;
				1872	// Render a row of pixels from source into a buffer.
				1873	float uv[2];
				1874	uv[0] = uv_dudv[0];
				1875	uv[1] = uv_dudv[1];
				1876	for (i = 0; i < width; ++i) {
				1877	int x = (int)(uv[0]);
				1878	int y = (int)(uv[1]);
				1879	(uint32)(dst_argb) =
				1880	(const uint32)(src_argb + y * src_argb_stride +
				1881	x * 4);
				1882	dst_argb += 4;
				1883	uv[0] += uv_dudv[2];
				1884	uv[1] += uv_dudv[3];
				1885	}
				1886	}
				1887
				1888	// Blend 2 rows into 1 for conversions such as I422ToI420.
				1889	void HalfRow_C(const uint8* src_uv, int src_uv_stride,
				1890	uint8* dst_uv, int pix) {
				1891	int x;
				1892	for (x = 0; x < pix; ++x) {
				1893	dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
				1894	}
				1895	}
				1896
				1897	void HalfRow_16_C(const uint16* src_uv, int src_uv_stride,
				1898	uint16* dst_uv, int pix) {
				1899	int x;
				1900	for (x = 0; x < pix; ++x) {
				1901	dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
				1902	}
				1903	}
				1904
				1905	// C version 2x2 -> 2x1.
				1906	void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
				1907	ptrdiff_t src_stride,
				1908	int width, int source_y_fraction) {
				1909	int y1_fraction = source_y_fraction;
				1910	int y0_fraction = 256 - y1_fraction;
				1911	const uint8* src_ptr1 = src_ptr + src_stride;
				1912	int x;
				1913	if (source_y_fraction == 0) {
				1914	memcpy(dst_ptr, src_ptr, width);
				1915	return;
				1916	}
				1917	if (source_y_fraction == 128) {
				1918	HalfRow_C(src_ptr, (int)(src_stride), dst_ptr, width);
				1919	return;
				1920	}
				1921	for (x = 0; x < width - 1; x += 2) {
				1922	dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
				1923	dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
				1924	src_ptr += 2;
				1925	src_ptr1 += 2;
				1926	dst_ptr += 2;
				1927	}
				1928	if (width & 1) {
				1929	dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
				1930	}
				1931	}
				1932
				1933	void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
				1934	ptrdiff_t src_stride,
				1935	int width, int source_y_fraction) {
				1936	int y1_fraction = source_y_fraction;
				1937	int y0_fraction = 256 - y1_fraction;
				1938	const uint16* src_ptr1 = src_ptr + src_stride;
				1939	int x;
				1940	if (source_y_fraction == 0) {
				1941	memcpy(dst_ptr, src_ptr, width * 2);
				1942	return;
				1943	}
				1944	if (source_y_fraction == 128) {
				1945	HalfRow_16_C(src_ptr, (int)(src_stride), dst_ptr, width);
				1946	return;
				1947	}
				1948	for (x = 0; x < width - 1; x += 2) {
				1949	dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
				1950	dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
				1951	src_ptr += 2;
				1952	src_ptr1 += 2;
				1953	dst_ptr += 2;
				1954	}
				1955	if (width & 1) {
				1956	dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
				1957	}
				1958	}
				1959
				1960	// Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG
				1961	void ARGBToBayerRow_C(const uint8* src_argb,
				1962	uint8* dst_bayer, uint32 selector, int pix) {
				1963	int index0 = selector & 0xff;
				1964	int index1 = (selector >> 8) & 0xff;
				1965	// Copy a row of Bayer.
				1966	int x;
				1967	for (x = 0; x < pix - 1; x += 2) {
				1968	dst_bayer[0] = src_argb[index0];
				1969	dst_bayer[1] = src_argb[index1];
				1970	src_argb += 8;
				1971	dst_bayer += 2;
				1972	}
				1973	if (pix & 1) {
				1974	dst_bayer[0] = src_argb[index0];
				1975	}
				1976	}
				1977
				1978	// Select G channel from ARGB. e.g. GGGGGGGG
				1979	void ARGBToBayerGGRow_C(const uint8* src_argb,
				1980	uint8* dst_bayer, uint32 selector, int pix) {
				1981	// Copy a row of G.
				1982	int x;
				1983	for (x = 0; x < pix - 1; x += 2) {
				1984	dst_bayer[0] = src_argb[1];
				1985	dst_bayer[1] = src_argb[5];
				1986	src_argb += 8;
				1987	dst_bayer += 2;
				1988	}
				1989	if (pix & 1) {
				1990	dst_bayer[0] = src_argb[1];
				1991	}
				1992	}
				1993
				1994	// Use first 4 shuffler values to reorder ARGB channels.
				1995	void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
				1996	const uint8* shuffler, int pix) {
				1997	int index0 = shuffler[0];
				1998	int index1 = shuffler[1];
				1999	int index2 = shuffler[2];
				2000	int index3 = shuffler[3];
				2001	// Shuffle a row of ARGB.
				2002	int x;
				2003	for (x = 0; x < pix; ++x) {
				2004	// To support in-place conversion.
				2005	uint8 b = src_argb[index0];
				2006	uint8 g = src_argb[index1];
				2007	uint8 r = src_argb[index2];
				2008	uint8 a = src_argb[index3];
				2009	dst_argb[0] = b;
				2010	dst_argb[1] = g;
				2011	dst_argb[2] = r;
				2012	dst_argb[3] = a;
				2013	src_argb += 4;
				2014	dst_argb += 4;
				2015	}
				2016	}
				2017
				2018	void I422ToYUY2Row_C(const uint8* src_y,
				2019	const uint8* src_u,
				2020	const uint8* src_v,
				2021	uint8* dst_frame, int width) {
				2022	int x;
				2023	for (x = 0; x < width - 1; x += 2) {
				2024	dst_frame[0] = src_y[0];
				2025	dst_frame[1] = src_u[0];
				2026	dst_frame[2] = src_y[1];
				2027	dst_frame[3] = src_v[0];
				2028	dst_frame += 4;
				2029	src_y += 2;
				2030	src_u += 1;
				2031	src_v += 1;
				2032	}
				2033	if (width & 1) {
				2034	dst_frame[0] = src_y[0];
				2035	dst_frame[1] = src_u[0];
				2036	dst_frame[2] = src_y[0]; // duplicate last y
				2037	dst_frame[3] = src_v[0];
				2038	}
				2039	}
				2040
				2041	void I422ToUYVYRow_C(const uint8* src_y,
				2042	const uint8* src_u,
				2043	const uint8* src_v,
				2044	uint8* dst_frame, int width) {
				2045	int x;
				2046	for (x = 0; x < width - 1; x += 2) {
				2047	dst_frame[0] = src_u[0];
				2048	dst_frame[1] = src_y[0];
				2049	dst_frame[2] = src_v[0];
				2050	dst_frame[3] = src_y[1];
				2051	dst_frame += 4;
				2052	src_y += 2;
				2053	src_u += 1;
				2054	src_v += 1;
				2055	}
				2056	if (width & 1) {
				2057	dst_frame[0] = src_u[0];
				2058	dst_frame[1] = src_y[0];
				2059	dst_frame[2] = src_v[0];
				2060	dst_frame[3] = src_y[0]; // duplicate last y
				2061	}
				2062	}
				2063
				2064	#if !defined(LIBYUV_DISABLE_X86) && defined(HAS_I422TOARGBROW_SSSE3)
				2065	// row_win.cc has asm version, but GCC uses 2 step wrapper.
				2066	#if !defined(_MSC_VER) && (defined(__x86_64__) \|\| defined(__i386__))
				2067	void I422ToRGB565Row_SSSE3(const uint8* src_y,
				2068	const uint8* src_u,
				2069	const uint8* src_v,
				2070	uint8* rgb_buf,
				2071	int width) {
				2072	// Allocate a row of ARGB.
				2073	align_buffer_64(row, width * 4);
				2074	I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
				2075	ARGBToRGB565Row_SSE2(row, rgb_buf, width);
				2076	free_aligned_buffer_64(row);
				2077	}
				2078	#endif // !defined(_MSC_VER) && (defined(__x86_64__) \|\| defined(__i386__))
				2079
				2080	#if defined(_M_IX86) \|\| defined(__x86_64__) \|\| defined(__i386__)
				2081	void I422ToARGB1555Row_SSSE3(const uint8* src_y,
				2082	const uint8* src_u,
				2083	const uint8* src_v,
				2084	uint8* rgb_buf,
				2085	int width) {
				2086	// Allocate a row of ARGB.
				2087	align_buffer_64(row, width * 4);
				2088	I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
				2089	ARGBToARGB1555Row_SSE2(row, rgb_buf, width);
				2090	free_aligned_buffer_64(row);
				2091	}
				2092
				2093	void I422ToARGB4444Row_SSSE3(const uint8* src_y,
				2094	const uint8* src_u,
				2095	const uint8* src_v,
				2096	uint8* rgb_buf,
				2097	int width) {
				2098	// Allocate a row of ARGB.
				2099	align_buffer_64(row, width * 4);
				2100	I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
				2101	ARGBToARGB4444Row_SSE2(row, rgb_buf, width);
				2102	free_aligned_buffer_64(row);
				2103	}
				2104
				2105	void NV12ToRGB565Row_SSSE3(const uint8* src_y,
				2106	const uint8* src_uv,
				2107	uint8* dst_rgb565,
				2108	int width) {
				2109	// Allocate a row of ARGB.
				2110	align_buffer_64(row, width * 4);
				2111	NV12ToARGBRow_SSSE3(src_y, src_uv, row, width);
				2112	ARGBToRGB565Row_SSE2(row, dst_rgb565, width);
				2113	free_aligned_buffer_64(row);
				2114	}
				2115
				2116	void NV21ToRGB565Row_SSSE3(const uint8* src_y,
				2117	const uint8* src_vu,
				2118	uint8* dst_rgb565,
				2119	int width) {
				2120	// Allocate a row of ARGB.
				2121	align_buffer_64(row, width * 4);
				2122	NV21ToARGBRow_SSSE3(src_y, src_vu, row, width);
				2123	ARGBToRGB565Row_SSE2(row, dst_rgb565, width);
				2124	free_aligned_buffer_64(row);
				2125	}
				2126
				2127	void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
				2128	uint8* dst_argb,
				2129	int width) {
				2130	// Allocate a rows of yuv.
				2131	align_buffer_64(row_y, ((width + 63) & ~63) * 2);
				2132	uint8* row_u = row_y + ((width + 63) & ~63);
				2133	uint8* row_v = row_u + ((width + 63) & ~63) / 2;
				2134	YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, width);
				2135	YUY2ToYRow_SSE2(src_yuy2, row_y, width);
				2136	I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width);
				2137	free_aligned_buffer_64(row_y);
				2138	}
				2139
				2140	void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2,
				2141	uint8* dst_argb,
				2142	int width) {
				2143	// Allocate a rows of yuv.
				2144	align_buffer_64(row_y, ((width + 63) & ~63) * 2);
				2145	uint8* row_u = row_y + ((width + 63) & ~63);
				2146	uint8* row_v = row_u + ((width + 63) & ~63) / 2;
				2147	YUY2ToUV422Row_Unaligned_SSE2(src_yuy2, row_u, row_v, width);
				2148	YUY2ToYRow_Unaligned_SSE2(src_yuy2, row_y, width);
				2149	I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
				2150	free_aligned_buffer_64(row_y);
				2151	}
				2152
				2153	void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
				2154	uint8* dst_argb,
				2155	int width) {
				2156	// Allocate a rows of yuv.
				2157	align_buffer_64(row_y, ((width + 63) & ~63) * 2);
				2158	uint8* row_u = row_y + ((width + 63) & ~63);
				2159	uint8* row_v = row_u + ((width + 63) & ~63) / 2;
				2160	UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, width);
				2161	UYVYToYRow_SSE2(src_uyvy, row_y, width);
				2162	I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width);
				2163	free_aligned_buffer_64(row_y);
				2164	}
				2165
				2166	void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy,
				2167	uint8* dst_argb,
				2168	int width) {
				2169	// Allocate a rows of yuv.
				2170	align_buffer_64(row_y, ((width + 63) & ~63) * 2);
				2171	uint8* row_u = row_y + ((width + 63) & ~63);
				2172	uint8* row_v = row_u + ((width + 63) & ~63) / 2;
				2173	UYVYToUV422Row_Unaligned_SSE2(src_uyvy, row_u, row_v, width);
				2174	UYVYToYRow_Unaligned_SSE2(src_uyvy, row_y, width);
				2175	I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
				2176	free_aligned_buffer_64(row_y);
				2177	}
				2178
#endif  // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
				2180	#endif // !defined(LIBYUV_DISABLE_X86)
				2181
				2182	void ARGBPolynomialRow_C(const uint8* src_argb,
				2183	uint8* dst_argb, const float* poly,
				2184	int width) {
				2185	int i;
				2186	for (i = 0; i < width; ++i) {
				2187	float b = (float)(src_argb[0]);
				2188	float g = (float)(src_argb[1]);
				2189	float r = (float)(src_argb[2]);
				2190	float a = (float)(src_argb[3]);
				2191	float b2 = b * b;
				2192	float g2 = g * g;
				2193	float r2 = r * r;
				2194	float a2 = a * a;
				2195	float db = poly[0] + poly[4] * b;
				2196	float dg = poly[1] + poly[5] * g;
				2197	float dr = poly[2] + poly[6] * r;
				2198	float da = poly[3] + poly[7] * a;
				2199	float b3 = b2 * b;
				2200	float g3 = g2 * g;
				2201	float r3 = r2 * r;
				2202	float a3 = a2 * a;
				2203	db += poly[8] * b2;
				2204	dg += poly[9] * g2;
				2205	dr += poly[10] * r2;
				2206	da += poly[11] * a2;
				2207	db += poly[12] * b3;
				2208	dg += poly[13] * g3;
				2209	dr += poly[14] * r3;
				2210	da += poly[15] * a3;
				2211
				2212	dst_argb[0] = Clamp((int32)(db));
				2213	dst_argb[1] = Clamp((int32)(dg));
				2214	dst_argb[2] = Clamp((int32)(dr));
				2215	dst_argb[3] = Clamp((int32)(da));
				2216	src_argb += 4;
				2217	dst_argb += 4;
				2218	}
				2219	}
				2220
				2221	void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
				2222	const uint8* luma, uint32 lumacoeff) {
				2223	uint32 bc = lumacoeff & 0xff;
				2224	uint32 gc = (lumacoeff >> 8) & 0xff;
				2225	uint32 rc = (lumacoeff >> 16) & 0xff;
				2226
				2227	int i;
				2228	for (i = 0; i < width - 1; i += 2) {
				2229	// Luminance in rows, color values in columns.
				2230	const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
				2231	src_argb[2] * rc) & 0x7F00u) + luma;
				2232	const uint8* luma1;
				2233	dst_argb[0] = luma0[src_argb[0]];
				2234	dst_argb[1] = luma0[src_argb[1]];
				2235	dst_argb[2] = luma0[src_argb[2]];
				2236	dst_argb[3] = src_argb[3];
				2237	luma1 = ((src_argb[4] * bc + src_argb[5] * gc +
				2238	src_argb[6] * rc) & 0x7F00u) + luma;
				2239	dst_argb[4] = luma1[src_argb[4]];
				2240	dst_argb[5] = luma1[src_argb[5]];
				2241	dst_argb[6] = luma1[src_argb[6]];
				2242	dst_argb[7] = src_argb[7];
				2243	src_argb += 8;
				2244	dst_argb += 8;
				2245	}
				2246	if (width & 1) {
				2247	// Luminance in rows, color values in columns.
				2248	const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
				2249	src_argb[2] * rc) & 0x7F00u) + luma;
				2250	dst_argb[0] = luma0[src_argb[0]];
				2251	dst_argb[1] = luma0[src_argb[1]];
				2252	dst_argb[2] = luma0[src_argb[2]];
				2253	dst_argb[3] = src_argb[3];
				2254	}
				2255	}
				2256
				2257	void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) {
				2258	int i;
				2259	for (i = 0; i < width - 1; i += 2) {
				2260	dst[3] = src[3];
				2261	dst[7] = src[7];
				2262	dst += 8;
				2263	src += 8;
				2264	}
				2265	if (width & 1) {
				2266	dst[3] = src[3];
				2267	}
				2268	}
				2269
				2270	void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) {
				2271	int i;
				2272	for (i = 0; i < width - 1; i += 2) {
				2273	dst[3] = src[0];
				2274	dst[7] = src[1];
				2275	dst += 8;
				2276	src += 2;
				2277	}
				2278	if (width & 1) {
				2279	dst[3] = src[0];
				2280	}
				2281	}
				2282
				2283	#ifdef __cplusplus
				2284	} // extern "C"
				2285	} // namespace libyuv
				2286	#endif