Merge "vpxdec: restoring old md5 behavior for y4m files."
diff --git a/build/make/obj_int_extract.c b/build/make/obj_int_extract.c
index 495e9d7..dbd8c9e 100644
--- a/build/make/obj_int_extract.c
+++ b/build/make/obj_int_extract.c
@@ -21,6 +21,7 @@
OUTPUT_FMT_PLAIN,
OUTPUT_FMT_RVDS,
OUTPUT_FMT_GAS,
+ OUTPUT_FMT_C_HEADER,
} output_fmt_t;
int log_msg(const char *fmt, ...) {
@@ -46,6 +47,9 @@
case OUTPUT_FMT_GAS:
printf(".set %-40s, %5d\n", name, val);
return 0;
+    case OUTPUT_FMT_C_HEADER:  // Emit the symbol as a C preprocessor #define.
+      printf("#define %-40s %5d\n", name, val);
+      return 0;
default:
log_msg("Unsupported mode: %d", mode);
return 1;
@@ -491,6 +495,13 @@
sym.st_name),
val);
break;
+ case OUTPUT_FMT_C_HEADER:
+ printf("#define %-40s %5d\n",
+ parse_elf_string_table(&elf,
+ shdr.sh_link,
+ sym.st_name),
+ val);
+ break;
default:
printf("%s = %d\n",
parse_elf_string_table(&elf,
@@ -762,6 +773,7 @@
fprintf(stderr, "Output Formats:\n");
fprintf(stderr, " gas - compatible with GNU assembler\n");
fprintf(stderr, " rvds - compatible with armasm\n");
+ fprintf(stderr, " cheader - c/c++ header file\n");
goto bail;
}
@@ -771,6 +783,8 @@
mode = OUTPUT_FMT_RVDS;
else if (!strcmp(argv[1], "gas"))
mode = OUTPUT_FMT_GAS;
+ else if (!strcmp(argv[1], "cheader"))
+ mode = OUTPUT_FMT_C_HEADER;
else
f = argv[1];
diff --git a/examples.mk b/examples.mk
index 2337d1e..6940353 100644
--- a/examples.mk
+++ b/examples.mk
@@ -64,14 +64,20 @@
vp9_spatial_scalable_encoder.GUID = 4A38598D-627D-4505-9C7B-D4020C84100D
vp9_spatial_scalable_encoder.DESCRIPTION = Spatial Scalable Encoder
+ifeq ($(CONFIG_SHARED),no)
+UTILS-$(CONFIG_VP9_ENCODER) += resize_util.c
+endif
+
# XMA example disabled for now, not used in VP8
#UTILS-$(CONFIG_DECODERS) += example_xma.c
#example_xma.GUID = A955FC4A-73F1-44F7-135E-30D84D32F022
#example_xma.DESCRIPTION = External Memory Allocation mode usage
GEN_EXAMPLES-$(CONFIG_VP8_DECODER) += simple_decoder.c
-simple_decoder.GUID = D3BBF1E9-2427-450D-BBFF-B2843C1D44CC
-simple_decoder.DESCRIPTION = Simplified decoder loop
+simple_decoder.GUID = D3BBF1E9-2427-450D-BBFF-B2843C1D44CC
+simple_decoder.SRCS += ivfdec.h ivfdec.c
+simple_decoder.SRCS += tools_common.h tools_common.c
+simple_decoder.DESCRIPTION = Simplified decoder loop
GEN_EXAMPLES-$(CONFIG_VP8_DECODER) += postproc.c
postproc.GUID = 65E33355-F35E-4088-884D-3FD4905881D7
postproc.DESCRIPTION = Decoder postprocessor control
diff --git a/examples/decode_to_md5.c b/examples/decode_to_md5.c
index 67cbd6c..bba2182 100644
--- a/examples/decode_to_md5.c
+++ b/examples/decode_to_md5.c
@@ -29,7 +29,6 @@
// is processed, then U, then V. It is important to honor the image's `stride`
// values.
-#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -44,28 +43,6 @@
#include "./tools_common.h"
#include "./vpx_config.h"
-#define VP8_FOURCC 0x30385056
-#define VP9_FOURCC 0x30395056
-
-static vpx_codec_iface_t *get_codec_interface(unsigned int fourcc) {
- switch (fourcc) {
- case VP8_FOURCC:
- return vpx_codec_vp8_dx();
- case VP9_FOURCC:
- return vpx_codec_vp9_dx();
- }
- return NULL;
-}
-
-static void die_codec(vpx_codec_ctx_t *ctx, const char *s) {
- const char *detail = vpx_codec_error_detail(ctx);
-
- printf("%s: %s\n", s, vpx_codec_error(ctx));
- if(detail)
- printf(" %s\n",detail);
- exit(EXIT_FAILURE);
-}
-
static void get_image_md5(const vpx_image_t *img, unsigned char digest[16]) {
int plane, y;
MD5Context md5;
diff --git a/examples/simple_decoder.c b/examples/simple_decoder.c
index 17a5987..23399f4 100644
--- a/examples/simple_decoder.c
+++ b/examples/simple_decoder.c
@@ -77,110 +77,82 @@
// few exeptions, vpx_codec functions return an enumerated error status,
// with the value `0` indicating success.
-#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+
#define VPX_CODEC_DISABLE_COMPAT 1
-#include "./vpx_config.h"
+
#include "vpx/vp8dx.h"
#include "vpx/vpx_decoder.h"
-#define interface (vpx_codec_vp8_dx())
+#include "./ivfdec.h"
+#include "./tools_common.h"
+#include "./vpx_config.h"
-#define IVF_FILE_HDR_SZ (32)
-#define IVF_FRAME_HDR_SZ (12)
+static const char *exec_name;
-static unsigned int mem_get_le32(const unsigned char *mem) {
- return (mem[3] << 24)|(mem[2] << 16)|(mem[1] << 8)|(mem[0]);
+void usage_exit() {
+ fprintf(stderr, "Usage: %s <infile> <outfile>\n", exec_name);
+ exit(EXIT_FAILURE);
}
-static void die(const char *fmt, ...) {
- va_list ap;
-
- va_start(ap, fmt);
- vprintf(fmt, ap);
- if(fmt[strlen(fmt)-1] != '\n')
- printf("\n");
- exit(EXIT_FAILURE);
-}
-
-static void die_codec(vpx_codec_ctx_t *ctx, const char *s) {
- const char *detail = vpx_codec_error_detail(ctx);
-
- printf("%s: %s\n", s, vpx_codec_error(ctx));
- if(detail)
- printf(" %s\n",detail);
- exit(EXIT_FAILURE);
-}
-
-
int main(int argc, char **argv) {
- FILE *infile, *outfile;
- vpx_codec_ctx_t codec;
- int flags = 0, frame_cnt = 0;
- unsigned char file_hdr[IVF_FILE_HDR_SZ];
- unsigned char frame_hdr[IVF_FRAME_HDR_SZ];
- unsigned char frame[256*1024];
- vpx_codec_err_t res;
+ FILE *infile, *outfile;
+ vpx_codec_ctx_t codec;
+ vpx_codec_iface_t *iface;
+ int flags = 0, frame_cnt = 0;
+ vpx_video_t *video;
- (void)res;
- /* Open files */
- if(argc!=3)
- die("Usage: %s <infile> <outfile>\n", argv[0]);
- if(!(infile = fopen(argv[1], "rb")))
- die("Failed to open %s for reading", argv[1]);
- if(!(outfile = fopen(argv[2], "wb")))
- die("Failed to open %s for writing", argv[2]);
+ exec_name = argv[0];
- /* Read file header */
- if(!(fread(file_hdr, 1, IVF_FILE_HDR_SZ, infile) == IVF_FILE_HDR_SZ
- && file_hdr[0]=='D' && file_hdr[1]=='K' && file_hdr[2]=='I'
- && file_hdr[3]=='F'))
- die("%s is not an IVF file.", argv[1]);
+ if (argc != 3)
+ die("Invalid number of arguments");
- printf("Using %s\n",vpx_codec_iface_name(interface));
- /* Initialize codec */
- if(vpx_codec_dec_init(&codec, interface, NULL, flags))
- die_codec(&codec, "Failed to initialize decoder");
+ if (!(infile = fopen(argv[1], "rb")))
+ die("Failed to open %s for reading", argv[1]);
- /* Read each frame */
- while(fread(frame_hdr, 1, IVF_FRAME_HDR_SZ, infile) == IVF_FRAME_HDR_SZ) {
- int frame_sz = mem_get_le32(frame_hdr);
- vpx_codec_iter_t iter = NULL;
- vpx_image_t *img;
+ if (!(outfile = fopen(argv[2], "wb")))
+ die("Failed to open %s for writing", argv[2]);
+ video = vpx_video_open_file(infile);
+ if (!video)
+ die("%s is not an IVF file.", argv[1]);
- frame_cnt++;
- if(frame_sz > sizeof(frame))
- die("Frame %d data too big for example code buffer", frame_sz);
- if(fread(frame, 1, frame_sz, infile) != frame_sz)
- die("Frame %d failed to read complete frame", frame_cnt);
+ iface = get_codec_interface(vpx_video_get_fourcc(video));
+ if (!iface)
+ die("Unknown FOURCC code.");
- /* Decode the frame */
- if(vpx_codec_decode(&codec, frame, frame_sz, NULL, 0))
- die_codec(&codec, "Failed to decode frame");
+ printf("Using %s\n", vpx_codec_iface_name(iface));
- /* Write decoded data to disk */
- while((img = vpx_codec_get_frame(&codec, &iter))) {
- unsigned int plane, y;
+ if (vpx_codec_dec_init(&codec, iface, NULL, flags))
+ die_codec(&codec, "Failed to initialize decoder");
- for(plane=0; plane < 3; plane++) {
- unsigned char *buf =img->planes[plane];
+ while (vpx_video_read_frame(video)) {
+ vpx_codec_iter_t iter = NULL;
+ vpx_image_t *img = NULL;
+ size_t frame_size = 0;
+ const unsigned char *frame = vpx_video_get_frame(video, &frame_size);
+ if (vpx_codec_decode(&codec, frame, frame_size, NULL, 0))
+ die_codec(&codec, "Failed to decode frame");
- for(y=0; y < (plane ? (img->d_h + 1) >> 1 : img->d_h); y++) {
- (void) fwrite(buf, 1, (plane ? (img->d_w + 1) >> 1 : img->d_w),
- outfile);
- buf += img->stride[plane];
- }
- }
- }
+ while ((img = vpx_codec_get_frame(&codec, &iter)) != NULL) {
+ vpx_img_write(img, outfile);
+ ++frame_cnt;
}
- printf("Processed %d frames.\n",frame_cnt);
- if(vpx_codec_destroy(&codec))
- die_codec(&codec, "Failed to destroy codec");
+ }
- fclose(outfile);
- fclose(infile);
- return EXIT_SUCCESS;
+ printf("Processed %d frames.\n", frame_cnt);
+ if (vpx_codec_destroy(&codec))
+ die_codec(&codec, "Failed to destroy codec");
+
+ printf("Play: ffplay -f rawvideo -pix_fmt yuv420p -s %dx%d %s\n",
+ vpx_video_get_width(video), vpx_video_get_height(video), argv[2]);
+
+ vpx_video_close(video);
+
+ fclose(outfile);
+ fclose(infile);
+
+ return EXIT_SUCCESS;
}
diff --git a/resize_util.c b/resize_util.c
new file mode 100644
index 0000000..b068f55
--- /dev/null
+++ b/resize_util.c
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "./vp9/encoder/vp9_resize.h"
+
+static void usage(char *progname) {
+  // Prints the command-line synopsis of this tool to stdout.
+  printf("Usage:\n");
+  printf("%s <input_yuv> <width>x<height> <target_width>x<target_height> "
+         "<output_yuv> [<frames>]\n", progname);
+}
+
+// Parses "<width>x<height>" (separator 'x' or 'X') from v into *width and
+// *height. Returns 1 when both values are positive and 0 otherwise.
+static int parse_dim(char *v, int *width, int *height) {
+  char *sep = strchr(v, 'x');
+  if (sep == NULL)
+    sep = strchr(v, 'X');
+  if (sep == NULL)
+    return 0;
+
+  *width = atoi(v);
+  *height = atoi(sep + 1);
+  return (*width > 0 && *height > 0) ? 1 : 0;
+}
+
+// Resizes up to <frames> raw I420 frames from <input_yuv> and writes them to
+// <output_yuv>. Returns 0 on success, 1 on argument, I/O or malloc failure.
+int main(int argc, char *argv[]) {
+  char *fin, *fout;
+  FILE *fpin, *fpout;
+  uint8_t *inbuf, *inbuf_u, *inbuf_v, *outbuf, *outbuf_u, *outbuf_v;
+  int f, frames, status = 0;
+  int width, height, target_width, target_height;
+
+  if (argc < 5) {
+    printf("Incorrect parameters:\n");
+    usage(argv[0]);
+    return 1;
+  }
+
+  fin = argv[1];
+  fout = argv[4];
+  if (!parse_dim(argv[2], &width, &height)) {
+    printf("Incorrect parameters: %s\n", argv[2]);
+    usage(argv[0]);
+    return 1;
+  }
+  if (!parse_dim(argv[3], &target_width, &target_height)) {
+    printf("Incorrect parameters: %s\n", argv[3]);
+    usage(argv[0]);
+    return 1;
+  }
+
+  fpin = fopen(fin, "rb");
+  if (fpin == NULL) {
+    printf("Can't open file %s to read\n", fin);
+    usage(argv[0]);
+    return 1;
+  }
+  fpout = fopen(fout, "wb");
+  if (fpout == NULL) {
+    printf("Can't open file %s to write\n", fout);
+    usage(argv[0]);
+    fclose(fpin);  // Do not leak the input stream on this error path.
+    return 1;
+  }
+  frames = (argc >= 6) ? atoi(argv[5]) : INT_MAX;  // No count: process all.
+  printf("Input size: %dx%d\n", width, height);
+  printf("Target size: %dx%d, Frames: ", target_width, target_height);
+  if (frames == INT_MAX)
+    printf("All\n");
+  else
+    printf("%d\n", frames);
+
+  inbuf = (uint8_t*)malloc(width * height * 3 / 2);
+  outbuf = (uint8_t*)malloc(target_width * target_height * 3 / 2);
+  if (inbuf == NULL || outbuf == NULL) {
+    printf("Failed to allocate frame buffers\n");
+    status = 1;
+    goto cleanup;
+  }
+  inbuf_u = inbuf + width * height;  // U plane follows the full-size Y plane.
+  inbuf_v = inbuf_u + width * height / 4;
+  outbuf_u = outbuf + target_width * target_height;
+  outbuf_v = outbuf_u + target_width * target_height / 4;
+  for (f = 0; f < frames; f++) {
+    if (fread(inbuf, width * height * 3 / 2, 1, fpin) != 1)
+      break;  // End of input (or a short trailing frame).
+    vp9_resize_frame420(inbuf, width, inbuf_u, inbuf_v, width / 2, height,
+                        width, outbuf, target_width, outbuf_u, outbuf_v,
+                        target_width / 2, target_height, target_width);
+    if (fwrite(outbuf, target_width * target_height * 3 / 2, 1, fpout) != 1) {
+      printf("Failed to write frame %d to %s\n", f, fout);
+      status = 1;
+      break;
+    }
+  }
+  printf("%d frames processed\n", f);
+cleanup:
+  fclose(fpin);
+  fclose(fpout);
+  free(inbuf);  // free(NULL) is a no-op, so partial allocation is fine here.
+  free(outbuf);
+  return status;
+}
diff --git a/tools_common.c b/tools_common.c
index 9c24983..85bedc9 100644
--- a/tools_common.c
+++ b/tools_common.c
@@ -15,6 +15,10 @@
#include <stdlib.h>
#include <string.h>
+#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER
+#include "vpx/vp8dx.h"
+#endif
+
#if defined(_WIN32) || defined(__OS2__)
#include <io.h>
#include <fcntl.h>
@@ -60,6 +64,15 @@
LOG_ERROR("Warning");
}
+// Prints `s` with the codec's error string (plus the detailed error message
+// when one is available) to stdout, then exits with EXIT_FAILURE.
+void die_codec(vpx_codec_ctx_t *ctx, const char *s) {
+  const char *const detail = vpx_codec_error_detail(ctx);
+  printf("%s: %s\n", s, vpx_codec_error(ctx));
+  if (detail != NULL) printf(" %s\n", detail);
+  exit(EXIT_FAILURE);
+}
+
uint16_t mem_get_le16(const void *data) {
uint16_t val;
const uint8_t *mem = (const uint8_t*)data;
@@ -130,3 +143,34 @@
return shortread;
}
+
+// Maps a stream FOURCC to the matching decoder interface. Returns NULL when
+// the FOURCC is not recognized or its decoder is compiled out.
+vpx_codec_iface_t *get_codec_interface(unsigned int fourcc) {
+  vpx_codec_iface_t *iface = NULL;
+  switch (fourcc) {
+#if CONFIG_VP8_DECODER
+    case VP8_FOURCC: iface = vpx_codec_vp8_dx(); break;
+#endif
+#if CONFIG_VP9_DECODER
+    case VP9_FOURCC: iface = vpx_codec_vp9_dx(); break;
+#endif
+    default: break;
+  }
+  return iface;
+}
+
+// Writes the three planes of `img` to `file` row by row, honoring each
+// plane's stride; planes 1 and 2 are written at half (rounded-up) size.
+void vpx_img_write(const vpx_image_t *img, FILE *file) {
+  int plane;
+  for (plane = 0; plane < 3; ++plane) {
+    const int shift = plane ? 1 : 0;
+    const int w = (img->d_w + shift) >> shift;
+    const int h = (img->d_h + shift) >> shift;
+    const unsigned char *row = img->planes[plane];
+    int y;
+    for (y = 0; y < h; ++y, row += img->stride[plane])
+      fwrite(row, 1, w, file);
+  }
+}
diff --git a/tools_common.h b/tools_common.h
index 1d70ab5..967b7a1 100644
--- a/tools_common.h
+++ b/tools_common.h
@@ -13,6 +13,7 @@
#include <stdio.h>
#include "./vpx_config.h"
+#include "vpx/vpx_codec.h"
#include "vpx/vpx_image.h"
#include "vpx/vpx_integer.h"
@@ -112,6 +113,8 @@
void fatal(const char *fmt, ...);
void warn(const char *fmt, ...);
+void die_codec(vpx_codec_ctx_t *ctx, const char *s);
+
/* The tool including this file must define usage_exit() */
void usage_exit();
@@ -120,6 +123,12 @@
int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame);
+vpx_codec_iface_t *get_codec_interface(unsigned int fourcc);
+
+// TODO(dkovalev): move this function to vpx_image.{c, h}, so it will be part
+// of vpx_image_t support
+void vpx_img_write(const vpx_image_t *img, FILE *file);
+
#ifdef __cplusplus
} /* extern "C" */
#endif
diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.asm b/vp9/common/arm/neon/vp9_reconintra_neon.asm
index 65d087a..279f678 100644
--- a/vp9/common/arm/neon/vp9_reconintra_neon.asm
+++ b/vp9/common/arm/neon/vp9_reconintra_neon.asm
@@ -19,6 +19,7 @@
EXPORT |vp9_tm_predictor_4x4_neon|
EXPORT |vp9_tm_predictor_8x8_neon|
EXPORT |vp9_tm_predictor_16x16_neon|
+ EXPORT |vp9_tm_predictor_32x32_neon|
ARM
REQUIRE8
PRESERVE8
@@ -347,29 +348,32 @@
ldrb r12, [r12]
vdup.u8 d0, r12
+ ; preload 8 left
+ vld1.8 d30, [r3]
+
; Load above 8 pixels
vld1.64 {d2}, [r2]
+ vmovl.u8 q10, d30
+
; Compute above - ytop_left
vsubl.u8 q3, d2, d0
; Load left row by row and compute left + (above - ytop_left)
- vld1.u8 {d6}, [r3]
-
; 1st row and 2nd row
- vdup.u8 d0, d6[0]
- vdup.u8 d1, d6[1]
- vaddw.s16 q1, q3, d0
- vaddw.s16 q2, q3, d1
+ vdup.16 q0, d20[0]
+ vdup.16 q1, d20[1]
+ vadd.s16 q0, q3, q0
+ vadd.s16 q1, q3, q1
; 3rd row and 4th row
- vdup.u8 d0, d6[2]
- vdup.u8 d1, d6[3]
- vaddw.s16 q8, q3, d0
- vaddw.s16 q9, q3, d1
+ vdup.16 q8, d20[2]
+ vdup.16 q9, d20[3]
+ vadd.s16 q8, q3, q8
+ vadd.s16 q9, q3, q9
- vqshrun.s16 d0, q1, #0
- vqshrun.s16 d1, q2, #0
+ vqshrun.s16 d0, q0, #0
+ vqshrun.s16 d1, q1, #0
vqshrun.s16 d2, q8, #0
vqshrun.s16 d3, q9, #0
@@ -379,19 +383,19 @@
vst1.64 {d3}, [r0], r1
; 5th row and 6th row
- vdup.u8 d0, d6[4]
- vdup.u8 d1, d6[5]
- vaddw.s16 q1, q3, d0
- vaddw.s16 q2, q3, d1
+ vdup.16 q0, d21[0]
+ vdup.16 q1, d21[1]
+ vadd.s16 q0, q3, q0
+ vadd.s16 q1, q3, q1
- ; 7rd row and 8th row
- vdup.u8 d0, d6[6]
- vdup.u8 d1, d6[7]
- vaddw.s16 q8, q3, d0
- vaddw.s16 q9, q3, d1
+ ; 7th row and 8th row
+ vdup.16 q8, d21[2]
+ vdup.16 q9, d21[3]
+ vadd.s16 q8, q3, q8
+ vadd.s16 q9, q3, q9
- vqshrun.s16 d0, q1, #0
- vqshrun.s16 d1, q2, #0
+ vqshrun.s16 d0, q0, #0
+ vqshrun.s16 d1, q1, #0
vqshrun.s16 d2, q8, #0
vqshrun.s16 d3, q9, #0
@@ -499,4 +503,134 @@
bx lr
ENDP ; |vp9_tm_predictor_16x16_neon|
+;void vp9_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst (advanced by y_stride after every stored row)
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above (reused as the loop counter once above is loaded)
+; r3 const uint8_t *left (advanced by 8 after every preload)
+
+|vp9_tm_predictor_32x32_neon| PROC
+ ; Load ytop_left = above[-1] and broadcast it to every byte lane of q0.
+ sub r12, r2, #1
+ ldrb r12, [r12]
+ vdup.u8 q0, r12
+
+ ; Load above 32 pixels: q1 = above[0..15], q2 = above[16..31]
+ vld1.8 q1, [r2]!
+ vld1.8 q2, [r2]
+
+ ; preload 8 left pixels (left[0..7]) into d26
+ vld1.8 d26, [r3]!
+
+ ; Compute above - ytop_left as 16-bit lanes: q8..q11 = columns 0-7 .. 24-31
+ vsubl.u8 q8, d2, d0
+ vsubl.u8 q9, d3, d1
+ vsubl.u8 q10, d4, d0
+ vsubl.u8 q11, d5, d1
+
+ vmovl.u8 q3, d26 ; widen the 8 left pixels to 16 bits: d6 = 0..3, d7 = 4..7
+
+ ; Load left row by row and compute left + (above - ytop_left)
+ ; Process 8 rows in each single loop and loop 4 times to process 32 rows.
+ mov r2, #4
+
+loop_32x32_neon
+ ; Process two rows (left lanes d6[0] and d6[1]).
+ vdup.16 q0, d6[0]
+ vdup.16 q2, d6[1]
+ vadd.s16 q12, q0, q8
+ vadd.s16 q13, q0, q9
+ vadd.s16 q14, q0, q10
+ vadd.s16 q15, q0, q11
+ vqshrun.s16 d0, q12, #0 ; shift by 0: saturating narrow to unsigned 8 bit
+ vqshrun.s16 d1, q13, #0
+ vadd.s16 q12, q2, q8
+ vadd.s16 q13, q2, q9
+ vqshrun.s16 d2, q14, #0
+ vqshrun.s16 d3, q15, #0
+ vadd.s16 q14, q2, q10
+ vadd.s16 q15, q2, q11
+ vst1.64 {d0-d3}, [r0], r1 ; store one 32-pixel row, then dst += y_stride
+ vqshrun.s16 d24, q12, #0
+ vqshrun.s16 d25, q13, #0
+ vqshrun.s16 d26, q14, #0
+ vqshrun.s16 d27, q15, #0
+ vdup.16 q1, d6[2] ; set up the next row pair before the pending store
+ vdup.16 q2, d6[3]
+ vst1.64 {d24-d27}, [r0], r1
+
+ ; Process two rows (left lanes d6[2] and d6[3], already in q1 and q2).
+ vadd.s16 q12, q1, q8
+ vadd.s16 q13, q1, q9
+ vadd.s16 q14, q1, q10
+ vadd.s16 q15, q1, q11
+ vqshrun.s16 d0, q12, #0
+ vqshrun.s16 d1, q13, #0
+ vadd.s16 q12, q2, q8
+ vadd.s16 q13, q2, q9
+ vqshrun.s16 d2, q14, #0
+ vqshrun.s16 d3, q15, #0
+ vadd.s16 q14, q2, q10
+ vadd.s16 q15, q2, q11
+ vst1.64 {d0-d3}, [r0], r1
+ vqshrun.s16 d24, q12, #0
+ vqshrun.s16 d25, q13, #0
+ vqshrun.s16 d26, q14, #0
+ vqshrun.s16 d27, q15, #0
+ vdup.16 q0, d7[0]
+ vdup.16 q2, d7[1]
+ vst1.64 {d24-d27}, [r0], r1
+
+ ; Process two rows (left lanes d7[0] and d7[1], already in q0 and q2).
+ vadd.s16 q12, q0, q8
+ vadd.s16 q13, q0, q9
+ vadd.s16 q14, q0, q10
+ vadd.s16 q15, q0, q11
+ vqshrun.s16 d0, q12, #0
+ vqshrun.s16 d1, q13, #0
+ vadd.s16 q12, q2, q8
+ vadd.s16 q13, q2, q9
+ vqshrun.s16 d2, q14, #0
+ vqshrun.s16 d3, q15, #0
+ vadd.s16 q14, q2, q10
+ vadd.s16 q15, q2, q11
+ vst1.64 {d0-d3}, [r0], r1
+ vqshrun.s16 d24, q12, #0
+ vqshrun.s16 d25, q13, #0
+ vqshrun.s16 d26, q14, #0
+ vqshrun.s16 d27, q15, #0
+ vdup.16 q0, d7[2]
+ vdup.16 q2, d7[3]
+ vst1.64 {d24-d27}, [r0], r1
+
+ ; Process two rows (left lanes d7[2] and d7[3], already in q0 and q2).
+ vadd.s16 q12, q0, q8
+ vadd.s16 q13, q0, q9
+ vadd.s16 q14, q0, q10
+ vadd.s16 q15, q0, q11
+ vqshrun.s16 d0, q12, #0
+ vqshrun.s16 d1, q13, #0
+ vadd.s16 q12, q2, q8
+ vadd.s16 q13, q2, q9
+ vqshrun.s16 d2, q14, #0
+ vqshrun.s16 d3, q15, #0
+ vadd.s16 q14, q2, q10
+ vadd.s16 q15, q2, q11
+ vst1.64 {d0-d3}, [r0], r1
+ vqshrun.s16 d24, q12, #0
+ vqshrun.s16 d25, q13, #0
+ vld1.8 d0, [r3]! ; preload 8 left pixels; NOTE(review): 4th pass reads left[32..39] - confirm safe
+ vqshrun.s16 d26, q14, #0
+ vqshrun.s16 d27, q15, #0
+ vmovl.u8 q3, d0 ; widen the next 8 left pixels for the following iteration
+ vst1.64 {d24-d27}, [r0], r1
+
+ subs r2, r2, #1 ; one group of 8 rows done
+ bgt loop_32x32_neon
+
+ bx lr
+ ENDP ; |vp9_tm_predictor_32x32_neon|
+
END
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 39ea492..9e16d8f 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -252,7 +252,7 @@
/* Inverse transform function pointers. */
void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob);
- struct subpix_fn_table subpix;
+ const interp_kernel *interp_kernel;
int corrupted;
diff --git a/vp9/common/vp9_filter.h b/vp9/common/vp9_filter.h
index c8a0b61..b611e30 100644
--- a/vp9/common/vp9_filter.h
+++ b/vp9/common/vp9_filter.h
@@ -35,11 +35,6 @@
typedef int16_t interp_kernel[SUBPEL_TAPS];
-struct subpix_fn_table {
- const interp_kernel *filter_x;
- const interp_kernel *filter_y;
-};
-
const interp_kernel *vp9_get_interp_kernel(INTERP_FILTER filter);
extern const interp_kernel vp9_bilinear_filters[SUBPEL_SHIFTS];
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index 533f7f3..20b78bf 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -96,7 +96,7 @@
}
}
-static void idct4_1d(const int16_t *input, int16_t *output) {
+static void idct4(const int16_t *input, int16_t *output) {
int16_t step[4];
int temp1, temp2;
// stage 1
@@ -124,7 +124,7 @@
// Rows
for (i = 0; i < 4; ++i) {
- idct4_1d(input, outptr);
+ idct4(input, outptr);
input += 4;
outptr += 4;
}
@@ -133,7 +133,7 @@
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
temp_in[j] = out[j * 4 + i];
- idct4_1d(temp_in, temp_out);
+ idct4(temp_in, temp_out);
for (j = 0; j < 4; ++j)
dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+ dest[j * stride + i]);
@@ -156,7 +156,7 @@
}
}
-static void idct8_1d(const int16_t *input, int16_t *output) {
+static void idct8(const int16_t *input, int16_t *output) {
int16_t step1[8], step2[8];
int temp1, temp2;
// stage 1
@@ -174,7 +174,7 @@
step1[6] = dct_const_round_shift(temp2);
// stage 2 & stage 3 - even half
- idct4_1d(step1, step1);
+ idct4(step1, step1);
// stage 2 - odd half
step2[4] = step1[4] + step1[5];
@@ -209,7 +209,7 @@
// First transform rows
for (i = 0; i < 8; ++i) {
- idct8_1d(input, outptr);
+ idct8(input, outptr);
input += 8;
outptr += 8;
}
@@ -218,7 +218,7 @@
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = out[j * 8 + i];
- idct8_1d(temp_in, temp_out);
+ idct8(temp_in, temp_out);
for (j = 0; j < 8; ++j)
dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ dest[j * stride + i]);
@@ -238,7 +238,7 @@
}
}
-static void iadst4_1d(const int16_t *input, int16_t *output) {
+static void iadst4(const int16_t *input, int16_t *output) {
int s0, s1, s2, s3, s4, s5, s6, s7;
int x0 = input[0];
@@ -283,10 +283,10 @@
void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride,
int tx_type) {
const transform_2d IHT_4[] = {
- { idct4_1d, idct4_1d }, // DCT_DCT = 0
- { iadst4_1d, idct4_1d }, // ADST_DCT = 1
- { idct4_1d, iadst4_1d }, // DCT_ADST = 2
- { iadst4_1d, iadst4_1d } // ADST_ADST = 3
+ { idct4, idct4 }, // DCT_DCT = 0
+ { iadst4, idct4 }, // ADST_DCT = 1
+ { idct4, iadst4 }, // DCT_ADST = 2
+ { iadst4, iadst4 } // ADST_ADST = 3
};
int i, j;
@@ -311,7 +311,7 @@
+ dest[j * stride + i]);
}
}
-static void iadst8_1d(const int16_t *input, int16_t *output) {
+static void iadst8(const int16_t *input, int16_t *output) {
int s0, s1, s2, s3, s4, s5, s6, s7;
int x0 = input[7];
@@ -389,10 +389,10 @@
}
static const transform_2d IHT_8[] = {
- { idct8_1d, idct8_1d }, // DCT_DCT = 0
- { iadst8_1d, idct8_1d }, // ADST_DCT = 1
- { idct8_1d, iadst8_1d }, // DCT_ADST = 2
- { iadst8_1d, iadst8_1d } // ADST_ADST = 3
+ { idct8, idct8 }, // DCT_DCT = 0
+ { iadst8, idct8 }, // ADST_DCT = 1
+ { idct8, iadst8 }, // DCT_ADST = 2
+ { iadst8, iadst8 } // ADST_ADST = 3
};
void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride,
@@ -430,7 +430,7 @@
// First transform rows
// only first 4 row has non-zero coefs
for (i = 0; i < 4; ++i) {
- idct8_1d(input, outptr);
+ idct8(input, outptr);
input += 8;
outptr += 8;
}
@@ -439,14 +439,14 @@
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = out[j * 8 + i];
- idct8_1d(temp_in, temp_out);
+ idct8(temp_in, temp_out);
for (j = 0; j < 8; ++j)
dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ dest[j * stride + i]);
}
}
-static void idct16_1d(const int16_t *input, int16_t *output) {
+static void idct16(const int16_t *input, int16_t *output) {
int16_t step1[16], step2[16];
int temp1, temp2;
@@ -619,7 +619,7 @@
// First transform rows
for (i = 0; i < 16; ++i) {
- idct16_1d(input, outptr);
+ idct16(input, outptr);
input += 16;
outptr += 16;
}
@@ -628,14 +628,14 @@
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
temp_in[j] = out[j * 16 + i];
- idct16_1d(temp_in, temp_out);
+ idct16(temp_in, temp_out);
for (j = 0; j < 16; ++j)
dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ dest[j * stride + i]);
}
}
-static void iadst16_1d(const int16_t *input, int16_t *output) {
+static void iadst16(const int16_t *input, int16_t *output) {
int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
int x0 = input[15];
@@ -807,10 +807,10 @@
}
static const transform_2d IHT_16[] = {
- { idct16_1d, idct16_1d }, // DCT_DCT = 0
- { iadst16_1d, idct16_1d }, // ADST_DCT = 1
- { idct16_1d, iadst16_1d }, // DCT_ADST = 2
- { iadst16_1d, iadst16_1d } // ADST_ADST = 3
+ { idct16, idct16 }, // DCT_DCT = 0
+ { iadst16, idct16 }, // ADST_DCT = 1
+ { idct16, iadst16 }, // DCT_ADST = 2
+ { iadst16, iadst16 } // ADST_ADST = 3
};
void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride,
@@ -848,7 +848,7 @@
// First transform rows. Since all non-zero dct coefficients are in
// upper-left 4x4 area, we only need to calculate first 4 rows here.
for (i = 0; i < 4; ++i) {
- idct16_1d(input, outptr);
+ idct16(input, outptr);
input += 16;
outptr += 16;
}
@@ -857,7 +857,7 @@
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
temp_in[j] = out[j*16 + i];
- idct16_1d(temp_in, temp_out);
+ idct16(temp_in, temp_out);
for (j = 0; j < 16; ++j)
dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ dest[j * stride + i]);
@@ -877,7 +877,7 @@
}
}
-static void idct32_1d(const int16_t *input, int16_t *output) {
+static void idct32(const int16_t *input, int16_t *output) {
int16_t step1[32], step2[32];
int temp1, temp2;
@@ -1263,7 +1263,7 @@
zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
if (zero_coeff[0] | zero_coeff[1])
- idct32_1d(input, outptr);
+ idct32(input, outptr);
else
vpx_memset(outptr, 0, sizeof(int16_t) * 32);
input += 32;
@@ -1274,7 +1274,7 @@
for (i = 0; i < 32; ++i) {
for (j = 0; j < 32; ++j)
temp_in[j] = out[j * 32 + i];
- idct32_1d(temp_in, temp_out);
+ idct32(temp_in, temp_out);
for (j = 0; j < 32; ++j)
dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ dest[j * stride + i]);
@@ -1290,7 +1290,7 @@
// Rows
// only upper-left 8x8 has non-zero coeff
for (i = 0; i < 8; ++i) {
- idct32_1d(input, outptr);
+ idct32(input, outptr);
input += 32;
outptr += 32;
}
@@ -1299,7 +1299,7 @@
for (i = 0; i < 32; ++i) {
for (j = 0; j < 32; ++j)
temp_in[j] = out[j * 32 + i];
- idct32_1d(temp_in, temp_out);
+ idct32(temp_in, temp_out);
for (j = 0; j < 32; ++j)
dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ dest[j * stride + i]);
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index b5a9248..d554cc0 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -69,13 +69,11 @@
const int subpel_y,
const struct scale_factors *sf,
int w, int h, int ref,
- const struct subpix_fn_table *subpix,
+ const interp_kernel *kernel,
int xs, int ys) {
sf->predict[subpel_x != 0][subpel_y != 0][ref](
src, src_stride, dst, dst_stride,
- subpix->filter_x[subpel_x], xs,
- subpix->filter_y[subpel_y], ys,
- w, h);
+ kernel[subpel_x], xs, kernel[subpel_y], ys, w, h);
}
void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
@@ -83,7 +81,7 @@
const MV *src_mv,
const struct scale_factors *sf,
int w, int h, int ref,
- const struct subpix_fn_table *subpix,
+ const interp_kernel *kernel,
enum mv_precision precision,
int x, int y) {
const int is_q4 = precision == MV_PRECISION_Q4;
@@ -96,7 +94,7 @@
src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS);
inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y,
- sf, w, h, ref, subpix, sf->x_step_q4, sf->y_step_q4);
+ sf, w, h, ref, kernel, sf->x_step_q4, sf->y_step_q4);
}
static INLINE int round_mv_comp_q4(int value) {
@@ -198,7 +196,8 @@
+ (scaled_mv.col >> SUBPEL_BITS);
inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
- subpel_x, subpel_y, sf, w, h, ref, &xd->subpix, xs, ys);
+ subpel_x, subpel_y, sf, w, h, ref, xd->interp_kernel,
+ xs, ys);
}
}
@@ -367,7 +366,7 @@
}
inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
- subpel_y, sf, w, h, ref, &xd->subpix, xs, ys);
+ subpel_y, sf, w, h, ref, xd->interp_kernel, xs, ys);
}
}
diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h
index 58f4b41..3345d83 100644
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -18,7 +18,6 @@
extern "C" {
#endif
-struct subpix_fn_table;
void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize);
@@ -36,7 +35,7 @@
const MV *mv_q3,
const struct scale_factors *sf,
int w, int h, int do_avg,
- const struct subpix_fn_table *subpix,
+ const interp_kernel *kernel,
enum mv_precision precision,
int x, int y);
@@ -94,11 +93,6 @@
}
}
-static void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd, int ref0, int ref1) {
- xd->block_refs[0] = &cm->frame_refs[ref0 >= 0 ? ref0 : 0];
- xd->block_refs[1] = &cm->frame_refs[ref1 >= 0 ? ref1 : 0];
-}
-
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 8fd6db8..66defb9 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -174,7 +174,7 @@
specialize vp9_v_predictor_32x32 $sse2_x86inc neon
prototype void vp9_tm_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_tm_predictor_32x32 $sse2_x86_64
+specialize vp9_tm_predictor_32x32 $sse2_x86_64 neon
prototype void vp9_dc_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_dc_predictor_32x32 $sse2_x86inc
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index 2f61494..13a5b5a 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -180,7 +180,7 @@
res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
}
-static void idct4_1d_sse2(__m128i *in) {
+static void idct4_sse2(__m128i *in) {
const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
@@ -216,7 +216,7 @@
in[1] = _mm_shuffle_epi32(in[1], 0x4E);
}
-static void iadst4_1d_sse2(__m128i *in) {
+static void iadst4_sse2(__m128i *in) {
const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
@@ -276,20 +276,20 @@
switch (tx_type) {
case 0: // DCT_DCT
- idct4_1d_sse2(in);
- idct4_1d_sse2(in);
+ idct4_sse2(in);
+ idct4_sse2(in);
break;
case 1: // ADST_DCT
- idct4_1d_sse2(in);
- iadst4_1d_sse2(in);
+ idct4_sse2(in);
+ iadst4_sse2(in);
break;
case 2: // DCT_ADST
- iadst4_1d_sse2(in);
- idct4_1d_sse2(in);
+ iadst4_sse2(in);
+ idct4_sse2(in);
break;
case 3: // ADST_ADST
- iadst4_1d_sse2(in);
- iadst4_1d_sse2(in);
+ iadst4_sse2(in);
+ iadst4_sse2(in);
break;
default:
assert(0);
@@ -455,7 +455,7 @@
res1 = _mm_packs_epi32(tmp2, tmp3); \
}
-#define IDCT8_1D(in0, in1, in2, in3, in4, in5, in6, in7, \
+#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3, out4, out5, out6, out7) \
{ \
/* Stage1 */ \
@@ -573,7 +573,7 @@
in0, in1, in2, in3, in4, in5, in6, in7);
// 4-stage 1D idct8x8
- IDCT8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+ IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
in0, in1, in2, in3, in4, in5, in6, in7);
}
@@ -674,7 +674,7 @@
out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
}
-static void idct8_1d_sse2(__m128i *in) {
+static void idct8_sse2(__m128i *in) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
@@ -695,11 +695,11 @@
in0, in1, in2, in3, in4, in5, in6, in7);
// 4-stage 1D idct8x8
- IDCT8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+ IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
}
-static void iadst8_1d_sse2(__m128i *in) {
+static void iadst8_sse2(__m128i *in) {
const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
@@ -946,20 +946,20 @@
switch (tx_type) {
case 0: // DCT_DCT
- idct8_1d_sse2(in);
- idct8_1d_sse2(in);
+ idct8_sse2(in);
+ idct8_sse2(in);
break;
case 1: // ADST_DCT
- idct8_1d_sse2(in);
- iadst8_1d_sse2(in);
+ idct8_sse2(in);
+ iadst8_sse2(in);
break;
case 2: // DCT_ADST
- iadst8_1d_sse2(in);
- idct8_1d_sse2(in);
+ iadst8_sse2(in);
+ idct8_sse2(in);
break;
case 3: // ADST_ADST
- iadst8_1d_sse2(in);
- iadst8_1d_sse2(in);
+ iadst8_sse2(in);
+ iadst8_sse2(in);
break;
default:
assert(0);
@@ -1104,7 +1104,7 @@
TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
- IDCT8_1D(in0, in1, in2, in3, zero, zero, zero, zero,
+ IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
in0, in1, in2, in3, in4, in5, in6, in7);
// Final rounding and shift
in0 = _mm_adds_epi16(in0, final_rounding);
@@ -1135,7 +1135,7 @@
RECON_AND_STORE(dest, in7);
}
-#define IDCT16_1D \
+#define IDCT16 \
/* Stage2 */ \
{ \
const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
@@ -1264,7 +1264,7 @@
stp2_10, stp2_13, stp2_11, stp2_12) \
}
-#define IDCT16_10_1D \
+#define IDCT16_10 \
/* Stage2 */ \
{ \
const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
@@ -1437,7 +1437,7 @@
array_transpose_8x8(in, in);
array_transpose_8x8(in+8, in+8);
- IDCT16_1D
+ IDCT16
// Stage7
curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
@@ -1465,7 +1465,7 @@
array_transpose_8x8(l+i*8, in);
array_transpose_8x8(r+i*8, in+8);
- IDCT16_1D
+ IDCT16
// 2-D
in[0] = _mm_add_epi16(stp2_0, stp1_15);
@@ -1590,7 +1590,7 @@
res0[15] = tbuf[7];
}
-static void iadst16_1d_8col(__m128i *in) {
+static void iadst16_8col(__m128i *in) {
// perform 16x16 1-D ADST for 8 columns
__m128i s[16], x[16], u[32], v[32];
const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
@@ -2060,7 +2060,7 @@
in[15] = _mm_sub_epi16(kZero, s[1]);
}
-static void idct16_1d_8col(__m128i *in) {
+static void idct16_8col(__m128i *in) {
const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
@@ -2404,16 +2404,16 @@
in[15] = _mm_sub_epi16(s[0], s[15]);
}
-static void idct16_1d_sse2(__m128i *in0, __m128i *in1) {
+static void idct16_sse2(__m128i *in0, __m128i *in1) {
array_transpose_16x16(in0, in1);
- idct16_1d_8col(in0);
- idct16_1d_8col(in1);
+ idct16_8col(in0);
+ idct16_8col(in1);
}
-static void iadst16_1d_sse2(__m128i *in0, __m128i *in1) {
+static void iadst16_sse2(__m128i *in0, __m128i *in1) {
array_transpose_16x16(in0, in1);
- iadst16_1d_8col(in0);
- iadst16_1d_8col(in1);
+ iadst16_8col(in0);
+ iadst16_8col(in1);
}
static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
@@ -2502,20 +2502,20 @@
switch (tx_type) {
case 0: // DCT_DCT
- idct16_1d_sse2(in0, in1);
- idct16_1d_sse2(in0, in1);
+ idct16_sse2(in0, in1);
+ idct16_sse2(in0, in1);
break;
case 1: // ADST_DCT
- idct16_1d_sse2(in0, in1);
- iadst16_1d_sse2(in0, in1);
+ idct16_sse2(in0, in1);
+ iadst16_sse2(in0, in1);
break;
case 2: // DCT_ADST
- iadst16_1d_sse2(in0, in1);
- idct16_1d_sse2(in0, in1);
+ iadst16_sse2(in0, in1);
+ idct16_sse2(in0, in1);
break;
case 3: // ADST_ADST
- iadst16_1d_sse2(in0, in1);
- iadst16_1d_sse2(in0, in1);
+ iadst16_sse2(in0, in1);
+ iadst16_sse2(in0, in1);
break;
default:
assert(0);
@@ -2732,7 +2732,7 @@
for (i = 0; i < 2; i++) {
array_transpose_4X8(l + 8*i, in);
- IDCT16_10_1D
+ IDCT16_10
// Stage7
in[0] = _mm_add_epi16(stp2_0, stp1_15);
@@ -2814,7 +2814,7 @@
input += 8; \
} \
-#define IDCT32_1D_34 \
+#define IDCT32_34 \
/* Stage1 */ \
{ \
const __m128i zero = _mm_setzero_si128();\
@@ -3115,7 +3115,7 @@
}
-#define IDCT32_1D \
+#define IDCT32 \
/* Stage1 */ \
{ \
const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
@@ -3554,7 +3554,7 @@
array_transpose_8x8(in+16, in+16);
array_transpose_8x8(in+24, in+24);
- IDCT32_1D
+ IDCT32
// 1_D: Store 32 intermediate results for each 8x32 block.
col[0] = _mm_add_epi16(stp1_0, stp1_31);
@@ -3593,7 +3593,7 @@
const __m128i zero = _mm_setzero_si128();
// Transpose 32x8 block to 8x32 block
array_transpose_8x8(col+i*8, in);
- IDCT32_1D_34
+ IDCT32_34
// 2_D: Calculate the results and store them to destination.
in[0] = _mm_add_epi16(stp1_0, stp1_31);
@@ -3922,7 +3922,7 @@
array_transpose_8x8(in+16, in+16);
array_transpose_8x8(in+24, in+24);
- IDCT32_1D
+ IDCT32
// 1_D: Store 32 intermediate results for each 8x32 block.
col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
@@ -3969,7 +3969,7 @@
array_transpose_8x8(col+j+64, in+16);
array_transpose_8x8(col+j+96, in+24);
- IDCT32_1D
+ IDCT32
// 2_D: Calculate the results and store them to destination.
in[0] = _mm_add_epi16(stp1_0, stp1_31);
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 466b180..1d9be53 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -421,8 +421,7 @@
if (has_second_ref(mbmi))
set_ref(cm, xd, 1, mi_row, mi_col);
- xd->subpix.filter_x = xd->subpix.filter_y =
- vp9_get_interp_kernel(mbmi->interp_filter);
+ xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter);
// Prediction
vp9_dec_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index 0f4a6bb..a840b48 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -997,7 +997,7 @@
return rv;
}
-static void dct32_1d(const int *input, int *output, int round) {
+static void fdct32(const int *input, int *output, int round) {
int step[32];
// Stage 1
step[0] = input[0] + input[(32 - 1)];
@@ -1329,7 +1329,7 @@
int temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = input[j * stride + i] * 4;
- dct32_1d(temp_in, temp_out, 0);
+ fdct32(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
}
@@ -1339,13 +1339,13 @@
int temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = output[j + i * 32];
- dct32_1d(temp_in, temp_out, 0);
+ fdct32(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
}
}
-// Note that although we use dct_32_round in dct32_1d computation flow,
+// Note that although we use dct_32_round in fdct32 computation flow,
// this 2d fdct32x32 for rate-distortion optimization loop is operating
// within 16 bits precision.
void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
@@ -1357,7 +1357,7 @@
int temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = input[j * stride + i] * 4;
- dct32_1d(temp_in, temp_out, 0);
+ fdct32(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
// TODO(cd): see quality impact of only doing
// output[j * 32 + i] = (temp_out[j] + 1) >> 2;
@@ -1370,7 +1370,7 @@
int temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = output[j + i * 32];
- dct32_1d(temp_in, temp_out, 1);
+ fdct32(temp_in, temp_out, 1);
for (j = 0; j < 32; ++j)
out[j + i * 32] = temp_out[j];
}
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 317ac98..e5d4583 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -2681,6 +2681,7 @@
vp9_update_zbin_extra(cpi, x);
}
} else {
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index d148530..3b641a1 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -27,20 +27,9 @@
void vp9_setup_interp_filters(MACROBLOCKD *xd, INTERP_FILTER filter,
VP9_COMMON *cm) {
- if (xd->mi_8x8 && xd->mi_8x8[0]) {
- MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
-
- set_ref_ptrs(cm, xd, mbmi->ref_frame[0] - LAST_FRAME,
- mbmi->ref_frame[1] - LAST_FRAME);
-
- } else {
- set_ref_ptrs(cm, xd, -1, -1);
- }
-
- xd->subpix.filter_x = xd->subpix.filter_y =
- vp9_get_interp_kernel(filter == SWITCHABLE ? EIGHTTAP : filter);
-
- assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0);
+ xd->interp_kernel = vp9_get_interp_kernel(filter == SWITCHABLE ? EIGHTTAP
+ : filter);
+ assert(((intptr_t)xd->interp_kernel & 0xff) == 0);
}
void vp9_subtract_block_c(int rows, int cols,
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 395ce20..28b343c 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -447,6 +447,16 @@
}
}
+// Picks the block size used for the macroblock at (mb_row, mb_col).
+// A dimension is 16 only when index 2 * mb + 1 is still below the frame
+// limit (cm->mi_cols / cm->mi_rows); otherwise that dimension falls back to 8.
+static BLOCK_SIZE get_bsize(const VP9_COMMON *cm, int mb_row, int mb_col) {
+  const int full_width = 2 * mb_col + 1 < cm->mi_cols;
+  const int full_height = 2 * mb_row + 1 < cm->mi_rows;
+
+  if (full_width)
+    return full_height ? BLOCK_16X16 : BLOCK_16X8;
+  return full_height ? BLOCK_8X16 : BLOCK_8X8;
+}
+
void vp9_first_pass(VP9_COMP *cpi) {
int mb_row, mb_col;
MACROBLOCK *const x = &cpi->mb;
@@ -542,6 +552,7 @@
int this_error;
int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
double error_weight = 1.0;
+ const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col);
vp9_clear_system_state(); // __asm emms;
@@ -549,30 +560,15 @@
xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset;
xd->left_available = (mb_col != 0);
-
- if (mb_col * 2 + 1 < cm->mi_cols) {
- if (mb_row * 2 + 1 < cm->mi_rows) {
- xd->mi_8x8[0]->mbmi.sb_type = BLOCK_16X16;
- } else {
- xd->mi_8x8[0]->mbmi.sb_type = BLOCK_16X8;
- }
- } else {
- if (mb_row * 2 + 1 < cm->mi_rows) {
- xd->mi_8x8[0]->mbmi.sb_type = BLOCK_8X16;
- } else {
- xd->mi_8x8[0]->mbmi.sb_type = BLOCK_8X8;
- }
- }
+ xd->mi_8x8[0]->mbmi.sb_type = bsize;
xd->mi_8x8[0]->mbmi.ref_frame[0] = INTRA_FRAME;
set_mi_row_col(xd, &tile,
- mb_row << 1,
- num_8x8_blocks_high_lookup[xd->mi_8x8[0]->mbmi.sb_type],
- mb_col << 1,
- num_8x8_blocks_wide_lookup[xd->mi_8x8[0]->mbmi.sb_type],
+ mb_row << 1, num_8x8_blocks_high_lookup[bsize],
+ mb_col << 1, num_8x8_blocks_wide_lookup[bsize],
cm->mi_rows, cm->mi_cols);
if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
- int energy = vp9_block_energy(cpi, x, xd->mi_8x8[0]->mbmi.sb_type);
+ int energy = vp9_block_energy(cpi, x, bsize);
error_weight = vp9_vaq_inv_q_ratio(energy);
}
@@ -692,9 +688,8 @@
xd->mi_8x8[0]->mbmi.tx_size = TX_4X4;
xd->mi_8x8[0]->mbmi.ref_frame[0] = LAST_FRAME;
xd->mi_8x8[0]->mbmi.ref_frame[1] = NONE;
- vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1,
- xd->mi_8x8[0]->mbmi.sb_type);
- vp9_encode_sby(x, xd->mi_8x8[0]->mbmi.sb_type);
+ vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize);
+ vp9_encode_sby(x, bsize);
sum_mvr += mv.as_mv.row;
sum_mvr_abs += abs(mv.as_mv.row);
sum_mvc += mv.as_mv.col;
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 1852d1e..8f06af1 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -839,6 +839,7 @@
if (speed >= 5) {
int i;
sf->disable_split_mask = DISABLE_ALL_SPLIT;
+ sf->subpel_force_stop = 1;
for (i = 0; i < TX_SIZES; i++) {
sf->intra_y_mode_mask[i] = INTRA_DC_H_V;
sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY;
@@ -867,6 +868,7 @@
sf->recode_loop = 1;
sf->subpel_search_method = SUBPEL_TREE;
sf->subpel_iters_per_step = 2;
+ sf->subpel_force_stop = 0;
sf->optimize_coefficients = !cpi->oxcf.lossless;
sf->reduce_first_step_size = 0;
sf->auto_mv_step_size = 0;
@@ -3395,6 +3397,7 @@
int64_t *time_stamp, int64_t *time_end, int flush) {
VP9_COMP *cpi = (VP9_COMP *) ptr;
VP9_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &cpi->mb.e_mbd;
struct vpx_usec_timer cmptimer;
YV12_BUFFER_CONFIG *force_src_buffer = NULL;
MV_REFERENCE_FRAME ref_frame;
@@ -3578,7 +3581,8 @@
vp9_extend_frame_borders(buf, cm->subsampling_x, cm->subsampling_y);
}
- vp9_setup_interp_filters(&cpi->mb.e_mbd, DEFAULT_INTERP_FILTER, cm);
+ set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
+ vp9_setup_interp_filters(xd, DEFAULT_INTERP_FILTER, cm);
if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
vp9_vaq_init();
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 9cf3f62..7a12f12 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -256,6 +256,9 @@
// Maximum number of steps in logarithmic subpel search before giving up.
int subpel_iters_per_step;
+ // Control when to stop subpel search
+ int subpel_force_stop;
+
// Thresh_mult is used to set a threshold for the rd score. A higher value
// means that we will accept the best mode so far more often. This number
// is used in combination with the current block size, and thresh_freq_fact
@@ -821,6 +824,12 @@
return mb_rows * mb_cols * (48 * 16 + 4);
}
+// Points xd->block_refs[0] and xd->block_refs[1] at the cm->frame_refs
+// entries for ref0 and ref1. frame_refs is indexed relative to LAST_FRAME,
+// so a reference below LAST_FRAME (intra, or no second reference) has no
+// entry of its own; such values are mapped to entry 0 rather than forming a
+// pointer in front of the array.
+static void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd,
+                         MV_REFERENCE_FRAME ref0, MV_REFERENCE_FRAME ref1) {
+  xd->block_refs[0] =
+      &cm->frame_refs[ref0 >= LAST_FRAME ? ref0 - LAST_FRAME : 0];
+  xd->block_refs[1] =
+      &cm->frame_refs[ref1 >= LAST_FRAME ? ref1 - LAST_FRAME : 0];
+}
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c
index 9d73df2..0c0a20f 100644
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -34,6 +34,22 @@
void vp9_set_alt_lf_level(VP9_COMP *cpi, int filt_val) {
}
+// Runs one loop-filter trial: filters cm->frame_to_show at filt_level,
+// scores the result against sd with vp9_calc_ss_err(), then copies
+// cpi->last_frame_uf back over cm->frame_to_show so the next trial starts
+// from the unfiltered data. Returns the score of this trial.
+static int try_filter_frame(const YV12_BUFFER_CONFIG *sd, VP9_COMP *const cpi,
+                            MACROBLOCKD *const xd, VP9_COMMON *const cm,
+                            int filt_level, int partial) {
+  int trial_err;
+
+  vp9_set_alt_lf_level(cpi, filt_level);
+  vp9_loop_filter_frame(cm, xd, filt_level, 1, partial);
+  trial_err = vp9_calc_ss_err(sd, cm->frame_to_show);
+
+  // Undo the filtering before handing the score back.
+  vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
+  return trial_err;
+}
+
static void search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
int partial) {
MACROBLOCKD *const xd = &cpi->mb.e_mbd;
@@ -41,31 +57,30 @@
struct loopfilter *const lf = &cm->lf;
const int min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
const int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
- int best_err = 0;
- int filt_err = 0;
+ int best_err;
int filt_best;
int filt_direction = 0;
// Start the search at the previous frame filter level unless it is now out of
// range.
int filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level);
int filter_step = filt_mid < 16 ? 4 : filt_mid / 4;
+ // Sum squared error at each filter level
+ int ss_err[MAX_LOOP_FILTER + 1];
+
+ // Set each entry to -1
+ vpx_memset(ss_err, 0xFF, sizeof(ss_err));
// Make a copy of the unfiltered / processed recon buffer
vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
- // Get baseline error score
- vp9_set_alt_lf_level(cpi, filt_mid);
- vp9_loop_filter_frame(cm, xd, filt_mid, 1, partial);
-
- best_err = vp9_calc_ss_err(sd, cm->frame_to_show);
+ best_err = try_filter_frame(sd, cpi, xd, cm, filt_mid, partial);
filt_best = filt_mid;
-
- // Re-instate the unfiltered frame
- vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
+ ss_err[filt_mid] = best_err;
while (filter_step > 0) {
const int filt_high = MIN(filt_mid + filter_step, max_filter_level);
const int filt_low = MAX(filt_mid - filter_step, min_filter_level);
+ int filt_err;
// Bias against raising loop filter in favor of lowering it.
int bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
@@ -79,14 +94,12 @@
if (filt_direction <= 0 && filt_low != filt_mid) {
// Get Low filter error score
- vp9_set_alt_lf_level(cpi, filt_low);
- vp9_loop_filter_frame(cm, xd, filt_low, 1, partial);
-
- filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
-
- // Re-instate the unfiltered frame
- vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
-
+ if (ss_err[filt_low] < 0) {
+ filt_err = try_filter_frame(sd, cpi, xd, cm, filt_low, partial);
+ ss_err[filt_low] = filt_err;
+ } else {
+ filt_err = ss_err[filt_low];
+ }
// If value is close to the best so far then bias towards a lower loop
// filter value.
if ((filt_err - bias) < best_err) {
@@ -100,14 +113,12 @@
// Now look at filt_high
if (filt_direction >= 0 && filt_high != filt_mid) {
- vp9_set_alt_lf_level(cpi, filt_high);
- vp9_loop_filter_frame(cm, xd, filt_high, 1, partial);
-
- filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
-
- // Re-instate the unfiltered frame
- vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
-
+ if (ss_err[filt_high] < 0) {
+ filt_err = try_filter_frame(sd, cpi, xd, cm, filt_high, partial);
+ ss_err[filt_high] = filt_err;
+ } else {
+ filt_err = ss_err[filt_high];
+ }
// Was it better than the previous best?
if (filt_err < (best_err - bias)) {
best_err = filt_err;
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 3be79f4..da4cd3a 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1517,8 +1517,8 @@
vp9_build_inter_predictor(pre, pd->pre[ref].stride,
dst, pd->dst.stride,
&mi->bmi[i].as_mv[ref].as_mv,
- &xd->block_refs[ref]->sf,
- width, height, ref, &xd->subpix, MV_PRECISION_Q3,
+ &xd->block_refs[ref]->sf, width, height, ref,
+ xd->interp_kernel, MV_PRECISION_Q3,
mi_col * MI_SIZE + 4 * (i % 2),
mi_row * MI_SIZE + 4 * (i / 2));
}
@@ -1840,7 +1840,8 @@
&bsi->ref_mv->as_mv,
cm->allow_high_precision_mv,
x->errorperbit, v_fn_ptr,
- 0, cpi->sf.subpel_iters_per_step,
+ cpi->sf.subpel_force_stop,
+ cpi->sf.subpel_iters_per_step,
x->nmvjointcost, x->mvcost,
&distortion,
&x->pred_sse[mbmi->ref_frame[0]]);
@@ -2451,7 +2452,8 @@
cm->allow_high_precision_mv,
x->errorperbit,
&cpi->fn_ptr[bsize],
- 0, cpi->sf.subpel_iters_per_step,
+ cpi->sf.subpel_force_stop,
+ cpi->sf.subpel_iters_per_step,
x->nmvjointcost, x->mvcost,
&dis, &x->pred_sse[ref]);
}
@@ -2536,7 +2538,7 @@
&frame_mv[refs[!id]].as_mv,
&xd->block_refs[!id]->sf,
pw, ph, 0,
- &xd->subpix, MV_PRECISION_Q3,
+ xd->interp_kernel, MV_PRECISION_Q3,
mi_col * MI_SIZE, mi_row * MI_SIZE);
// Compound motion search on first ref frame.
@@ -3277,7 +3279,7 @@
continue;
}
- set_ref_ptrs(cm, xd, ref_frame - 1, second_ref_frame - 1);
+ set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
mbmi->uv_mode = DC_PRED;
// Evaluate all sub-pel filters irrespective of whether we can use
@@ -3709,7 +3711,7 @@
vp9_zero(best_tx_diff);
}
- set_ref_ptrs(cm, xd, mbmi->ref_frame[0] - 1, mbmi->ref_frame[1] - 1);
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
store_coding_context(x, ctx, best_mode_index,
&mbmi->ref_mvs[mbmi->ref_frame[0]][0],
&mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
@@ -3902,7 +3904,7 @@
vp9_is_scaled(&cm->frame_refs[second_ref_frame - 1].sf))
continue;
- set_ref_ptrs(cm, xd, ref_frame - 1, second_ref_frame - 1);
+ set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
mbmi->uv_mode = DC_PRED;
// Evaluate all sub-pel filters irrespective of whether we can use
@@ -4442,7 +4444,7 @@
vp9_zero(best_tx_diff);
}
- set_ref_ptrs(cm, xd, mbmi->ref_frame[0] - 1, mbmi->ref_frame[1] - 1);
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
store_coding_context(x, ctx, best_mode_index,
&mbmi->ref_mvs[mbmi->ref_frame[0]][0],
&mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
diff --git a/vp9/encoder/vp9_resize.c b/vp9/encoder/vp9_resize.c
index f15abc0..0766b51 100644
--- a/vp9/encoder/vp9_resize.c
+++ b/vp9/encoder/vp9_resize.c
@@ -16,7 +16,6 @@
#include <string.h>
#include "vp9/common/vp9_common.h"
#include "vp9/encoder/vp9_resize.h"
-#include "vpx/vpx_integer.h"
#define FILTER_BITS 7
@@ -30,8 +29,44 @@
typedef int16_t interp_kernel[INTERP_TAPS];
-// Filters for interpolation - note this also filters integer pels.
-const interp_kernel vp9_filteredinterp_filters[(1 << SUBPEL_BITS)] = {
+// Filters for interpolation (0.5-band) - note this also filters integer pels.
+const interp_kernel vp9_filteredinterp_filters500[(1 << SUBPEL_BITS)] = {
+ {-3, 0, 35, 64, 35, 0, -3, 0},
+ {-3, -1, 34, 64, 36, 1, -3, 0},
+ {-3, -1, 32, 64, 38, 1, -3, 0},
+ {-2, -2, 31, 63, 39, 2, -3, 0},
+ {-2, -2, 29, 63, 41, 2, -3, 0},
+ {-2, -2, 28, 63, 42, 3, -4, 0},
+ {-2, -3, 27, 63, 43, 4, -4, 0},
+ {-2, -3, 25, 62, 45, 5, -4, 0},
+ {-2, -3, 24, 62, 46, 5, -4, 0},
+ {-2, -3, 23, 61, 47, 6, -4, 0},
+ {-2, -3, 21, 60, 49, 7, -4, 0},
+ {-1, -4, 20, 60, 50, 8, -4, -1},
+ {-1, -4, 19, 59, 51, 9, -4, -1},
+ {-1, -4, 17, 58, 52, 10, -4, 0},
+ {-1, -4, 16, 57, 53, 12, -4, -1},
+ {-1, -4, 15, 56, 54, 13, -4, -1},
+ {-1, -4, 14, 55, 55, 14, -4, -1},
+ {-1, -4, 13, 54, 56, 15, -4, -1},
+ {-1, -4, 12, 53, 57, 16, -4, -1},
+ {0, -4, 10, 52, 58, 17, -4, -1},
+ {-1, -4, 9, 51, 59, 19, -4, -1},
+ {-1, -4, 8, 50, 60, 20, -4, -1},
+ {0, -4, 7, 49, 60, 21, -3, -2},
+ {0, -4, 6, 47, 61, 23, -3, -2},
+ {0, -4, 5, 46, 62, 24, -3, -2},
+ {0, -4, 5, 45, 62, 25, -3, -2},
+ {0, -4, 4, 43, 63, 27, -3, -2},
+ {0, -4, 3, 42, 63, 28, -2, -2},
+ {0, -3, 2, 41, 63, 29, -2, -2},
+ {0, -3, 2, 39, 63, 31, -2, -2},
+ {0, -3, 1, 38, 64, 32, -1, -3},
+ {0, -3, 1, 36, 64, 34, -1, -3}
+};
+
+// Filters for interpolation (0.625-band) - note this also filters integer pels.
+const interp_kernel vp9_filteredinterp_filters625[(1 << SUBPEL_BITS)] = {
{-1, -8, 33, 80, 33, -8, -1, 0},
{-1, -8, 30, 80, 35, -8, -1, 1},
{-1, -8, 28, 80, 37, -7, -2, 1},
@@ -66,10 +101,132 @@
{1, -1, -8, 35, 80, 30, -8, -1},
};
+// Filters for interpolation (0.75-band) - note this also filters integer pels.
+const interp_kernel vp9_filteredinterp_filters750[(1 << SUBPEL_BITS)] = {
+ {2, -11, 25, 96, 25, -11, 2, 0},
+ {2, -11, 22, 96, 28, -11, 2, 0},
+ {2, -10, 19, 95, 31, -11, 2, 0},
+ {2, -10, 17, 95, 34, -12, 2, 0},
+ {2, -9, 14, 94, 37, -12, 2, 0},
+ {2, -8, 12, 93, 40, -12, 1, 0},
+ {2, -8, 9, 92, 43, -12, 1, 1},
+ {2, -7, 7, 91, 46, -12, 1, 0},
+ {2, -7, 5, 90, 49, -12, 1, 0},
+ {2, -6, 3, 88, 52, -12, 0, 1},
+ {2, -5, 1, 86, 55, -12, 0, 1},
+ {2, -5, -1, 84, 58, -11, 0, 1},
+ {2, -4, -2, 82, 61, -11, -1, 1},
+ {2, -4, -4, 80, 64, -10, -1, 1},
+ {1, -3, -5, 77, 67, -9, -1, 1},
+ {1, -3, -6, 75, 70, -8, -2, 1},
+ {1, -2, -7, 72, 72, -7, -2, 1},
+ {1, -2, -8, 70, 75, -6, -3, 1},
+ {1, -1, -9, 67, 77, -5, -3, 1},
+ {1, -1, -10, 64, 80, -4, -4, 2},
+ {1, -1, -11, 61, 82, -2, -4, 2},
+ {1, 0, -11, 58, 84, -1, -5, 2},
+ {1, 0, -12, 55, 86, 1, -5, 2},
+ {1, 0, -12, 52, 88, 3, -6, 2},
+ {0, 1, -12, 49, 90, 5, -7, 2},
+ {0, 1, -12, 46, 91, 7, -7, 2},
+ {1, 1, -12, 43, 92, 9, -8, 2},
+ {0, 1, -12, 40, 93, 12, -8, 2},
+ {0, 2, -12, 37, 94, 14, -9, 2},
+ {0, 2, -12, 34, 95, 17, -10, 2},
+ {0, 2, -11, 31, 95, 19, -10, 2},
+ {0, 2, -11, 28, 96, 22, -11, 2}
+};
+
+// Filters for interpolation (0.875-band) - note this also filters integer pels.
+const interp_kernel vp9_filteredinterp_filters875[(1 << SUBPEL_BITS)] = {
+ {3, -8, 13, 112, 13, -8, 3, 0},
+ {3, -7, 10, 112, 17, -9, 3, -1},
+ {2, -6, 7, 111, 21, -9, 3, -1},
+ {2, -5, 4, 111, 24, -10, 3, -1},
+ {2, -4, 1, 110, 28, -11, 3, -1},
+ {1, -3, -1, 108, 32, -12, 4, -1},
+ {1, -2, -3, 106, 36, -13, 4, -1},
+ {1, -1, -6, 105, 40, -14, 4, -1},
+ {1, -1, -7, 102, 44, -14, 4, -1},
+ {1, 0, -9, 100, 48, -15, 4, -1},
+ {1, 1, -11, 97, 53, -16, 4, -1},
+ {0, 1, -12, 95, 57, -16, 4, -1},
+ {0, 2, -13, 91, 61, -16, 4, -1},
+ {0, 2, -14, 88, 65, -16, 4, -1},
+ {0, 3, -15, 84, 69, -17, 4, 0},
+ {0, 3, -16, 81, 73, -16, 3, 0},
+ {0, 3, -16, 77, 77, -16, 3, 0},
+ {0, 3, -16, 73, 81, -16, 3, 0},
+ {0, 4, -17, 69, 84, -15, 3, 0},
+ {-1, 4, -16, 65, 88, -14, 2, 0},
+ {-1, 4, -16, 61, 91, -13, 2, 0},
+ {-1, 4, -16, 57, 95, -12, 1, 0},
+ {-1, 4, -16, 53, 97, -11, 1, 1},
+ {-1, 4, -15, 48, 100, -9, 0, 1},
+ {-1, 4, -14, 44, 102, -7, -1, 1},
+ {-1, 4, -14, 40, 105, -6, -1, 1},
+ {-1, 4, -13, 36, 106, -3, -2, 1},
+ {-1, 4, -12, 32, 108, -1, -3, 1},
+ {-1, 3, -11, 28, 110, 1, -4, 2},
+ {-1, 3, -10, 24, 111, 4, -5, 2},
+ {-1, 3, -9, 21, 111, 7, -6, 2},
+ {-1, 3, -9, 17, 112, 10, -7, 3}
+};
+
+// Filters for interpolation (full-band) - no filtering for integer pixels
+const interp_kernel vp9_filteredinterp_filters1000[(1 << SUBPEL_BITS)] = {
+ {0, 0, 0, 128, 0, 0, 0, 0},
+ {0, 1, -3, 128, 3, -1, 0, 0},
+ {-1, 2, -6, 127, 7, -2, 1, 0},
+ {-1, 3, -9, 126, 12, -4, 1, 0},
+ {-1, 4, -12, 125, 16, -5, 1, 0},
+ {-1, 4, -14, 123, 20, -6, 2, 0},
+ {-1, 5, -15, 120, 25, -8, 2, 0},
+ {-1, 5, -17, 118, 30, -9, 3, -1},
+ {-1, 6, -18, 114, 35, -10, 3, -1},
+ {-1, 6, -19, 111, 41, -12, 3, -1},
+ {-1, 6, -20, 107, 46, -13, 4, -1},
+ {-1, 6, -21, 103, 52, -14, 4, -1},
+ {-1, 6, -21, 99, 57, -16, 5, -1},
+ {-1, 6, -21, 94, 63, -17, 5, -1},
+ {-1, 6, -20, 89, 68, -18, 5, -1},
+ {-1, 6, -20, 84, 73, -19, 6, -1},
+ {-1, 6, -20, 79, 79, -20, 6, -1},
+ {-1, 6, -19, 73, 84, -20, 6, -1},
+ {-1, 5, -18, 68, 89, -20, 6, -1},
+ {-1, 5, -17, 63, 94, -21, 6, -1},
+ {-1, 5, -16, 57, 99, -21, 6, -1},
+ {-1, 4, -14, 52, 103, -21, 6, -1},
+ {-1, 4, -13, 46, 107, -20, 6, -1},
+ {-1, 3, -12, 41, 111, -19, 6, -1},
+ {-1, 3, -10, 35, 114, -18, 6, -1},
+ {-1, 3, -9, 30, 118, -17, 5, -1},
+ {0, 2, -8, 25, 120, -15, 5, -1},
+ {0, 2, -6, 20, 123, -14, 4, -1},
+ {0, 1, -5, 16, 125, -12, 4, -1},
+ {0, 1, -4, 12, 126, -9, 3, -1},
+ {0, 1, -2, 7, 127, -6, 2, -1},
+ {0, 0, -1, 3, 128, -3, 1, 0}
+};
+
// Filters for factor of 2 downsampling.
static const int16_t vp9_down2_symeven_half_filter[] = {56, 12, -3, -1};
static const int16_t vp9_down2_symodd_half_filter[] = {64, 35, 0, -3};
+// Selects the filter bank for resampling inlength samples to outlength
+// samples. The ratio outlength / inlength is compared in sixteenths:
+//   >= 16/16 -> full-band,  >= 13/16 -> 0.875-band,  >= 11/16 -> 0.75-band,
+//   >=  9/16 -> 0.625-band, anything smaller -> 0.5-band.
+static const interp_kernel *choose_interp_filter(int inlength, int outlength) {
+  const int scaled_out = outlength * 16;
+
+  if (scaled_out >= inlength * 16)
+    return vp9_filteredinterp_filters1000;
+  if (scaled_out >= inlength * 13)
+    return vp9_filteredinterp_filters875;
+  if (scaled_out >= inlength * 11)
+    return vp9_filteredinterp_filters750;
+  if (scaled_out >= inlength * 9)
+    return vp9_filteredinterp_filters625;
+  return vp9_filteredinterp_filters500;
+}
+
static void interpolate(const uint8_t *const input, int inlength,
uint8_t *output, int outlength) {
const int64_t delta = (((uint64_t)inlength << 32) + outlength / 2) /
@@ -81,6 +238,9 @@
int x, x1, x2, sum, k, int_pel, sub_pel;
int64_t y;
+ const interp_kernel *interp_filters =
+ choose_interp_filter(inlength, outlength);
+
x = 0;
y = offset;
while ((y >> INTERP_PRECISION_BITS) < (INTERP_TAPS / 2 - 1)) {
@@ -101,7 +261,7 @@
const int16_t *filter;
int_pel = y >> INTERP_PRECISION_BITS;
sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
- filter = vp9_filteredinterp_filters[sub_pel];
+ filter = interp_filters[sub_pel];
sum = 0;
for (k = 0; k < INTERP_TAPS; ++k) {
const int pk = int_pel - INTERP_TAPS / 2 + 1 + k;
@@ -116,7 +276,7 @@
const int16_t *filter;
int_pel = y >> INTERP_PRECISION_BITS;
sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
- filter = vp9_filteredinterp_filters[sub_pel];
+ filter = interp_filters[sub_pel];
sum = 0;
for (k = 0; k < INTERP_TAPS; ++k)
sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k < 0 ?
@@ -129,7 +289,7 @@
const int16_t *filter;
int_pel = y >> INTERP_PRECISION_BITS;
sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
- filter = vp9_filteredinterp_filters[sub_pel];
+ filter = interp_filters[sub_pel];
sum = 0;
for (k = 0; k < INTERP_TAPS; ++k)
sum += filter[k] * input[int_pel - INTERP_TAPS / 2 + 1 + k];
@@ -140,7 +300,7 @@
const int16_t *filter;
int_pel = y >> INTERP_PRECISION_BITS;
sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
- filter = vp9_filteredinterp_filters[sub_pel];
+ filter = interp_filters[sub_pel];
sum = 0;
for (k = 0; k < INTERP_TAPS; ++k)
sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k >=
diff --git a/vp9/encoder/vp9_resize.h b/vp9/encoder/vp9_resize.h
index c67595a..1818cd4 100644
--- a/vp9/encoder/vp9_resize.h
+++ b/vp9/encoder/vp9_resize.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -12,6 +12,7 @@
#define VP9_ENCODER_VP9_RESIZE_H_
#include <stdio.h>
+#include "vpx/vpx_integer.h"
void vp9_resize_plane(const uint8_t *const input,
int height,
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index c2eea0a..c9a4246 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -60,7 +60,7 @@
scale,
16, 16,
which_mv,
- &xd->subpix, MV_PRECISION_Q3, x, y);
+ xd->interp_kernel, MV_PRECISION_Q3, x, y);
vp9_build_inter_predictor(u_mb_ptr, uv_stride,
&pred[256], uv_block_size,
@@ -68,7 +68,7 @@
scale,
uv_block_size, uv_block_size,
which_mv,
- &xd->subpix, mv_precision_uv, x, y);
+ xd->interp_kernel, mv_precision_uv, x, y);
vp9_build_inter_predictor(v_mb_ptr, uv_stride,
&pred[512], uv_block_size,
@@ -76,7 +76,7 @@
scale,
uv_block_size, uv_block_size,
which_mv,
- &xd->subpix, mv_precision_uv, x, y);
+ xd->interp_kernel, mv_precision_uv, x, y);
}
void vp9_temporal_filter_apply_c(uint8_t *frame1,
diff --git a/vp9/encoder/x86/vp9_dct_avx2.c b/vp9/encoder/x86/vp9_dct_avx2.c
index d81b72b..ea031fb 100644
--- a/vp9/encoder/x86/vp9_dct_avx2.c
+++ b/vp9/encoder/x86/vp9_dct_avx2.c
@@ -163,7 +163,7 @@
res[3] = _mm_unpackhi_epi64(res[2], res[2]);
}
-void fdct4_1d_avx2(__m128i *in) {
+void fdct4_avx2(__m128i *in) {
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
@@ -196,7 +196,7 @@
transpose_4x4_avx2(in);
}
-void fadst4_1d_avx2(__m128i *in) {
+void fadst4_avx2(__m128i *in) {
const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
@@ -250,20 +250,20 @@
load_buffer_4x4_avx2(input, in, stride);
switch (tx_type) {
case 0: // DCT_DCT
- fdct4_1d_avx2(in);
- fdct4_1d_avx2(in);
+ fdct4_avx2(in);
+ fdct4_avx2(in);
break;
case 1: // ADST_DCT
- fadst4_1d_avx2(in);
- fdct4_1d_avx2(in);
+ fadst4_avx2(in);
+ fdct4_avx2(in);
break;
case 2: // DCT_ADST
- fdct4_1d_avx2(in);
- fadst4_1d_avx2(in);
+ fdct4_avx2(in);
+ fadst4_avx2(in);
break;
case 3: // ADST_ADST
- fadst4_1d_avx2(in);
- fadst4_1d_avx2(in);
+ fadst4_avx2(in);
+ fadst4_avx2(in);
break;
default:
assert(0);
@@ -658,7 +658,7 @@
// 07 17 27 37 47 57 67 77
}
-void fdct8_1d_avx2(__m128i *in) {
+void fdct8_avx2(__m128i *in) {
// constants
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
@@ -798,7 +798,7 @@
array_transpose_8x8_avx2(in, in);
}
-void fadst8_1d_avx2(__m128i *in) {
+void fadst8_avx2(__m128i *in) {
// Constants
const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
@@ -1034,20 +1034,20 @@
load_buffer_8x8_avx2(input, in, stride);
switch (tx_type) {
case 0: // DCT_DCT
- fdct8_1d_avx2(in);
- fdct8_1d_avx2(in);
+ fdct8_avx2(in);
+ fdct8_avx2(in);
break;
case 1: // ADST_DCT
- fadst8_1d_avx2(in);
- fdct8_1d_avx2(in);
+ fadst8_avx2(in);
+ fdct8_avx2(in);
break;
case 2: // DCT_ADST
- fdct8_1d_avx2(in);
- fadst8_1d_avx2(in);
+ fdct8_avx2(in);
+ fadst8_avx2(in);
break;
case 3: // ADST_ADST
- fadst8_1d_avx2(in);
- fadst8_1d_avx2(in);
+ fadst8_avx2(in);
+ fadst8_avx2(in);
break;
default:
assert(0);
@@ -1216,7 +1216,7 @@
step1_6 = _mm_sub_epi16(in01, in14);
step1_7 = _mm_sub_epi16(in00, in15);
}
- // Work on the first eight values; fdct8_1d(input, even_results);
+ // Work on the first eight values; fdct8(input, even_results);
{
// Add/substract
const __m128i q0 = _mm_add_epi16(input0, input7);
@@ -1730,7 +1730,7 @@
right_shift_8x8_avx2(res1 + 8, 2);
}
-void fdct16_1d_8col_avx2(__m128i *in) {
+void fdct16_8col_avx2(__m128i *in) {
// perform 16x16 1-D DCT for 8 columns
__m128i i[8], s[8], p[8], t[8], u[16], v[16];
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
@@ -2052,7 +2052,7 @@
in[15] = _mm_packs_epi32(v[14], v[15]);
}
-void fadst16_1d_8col_avx2(__m128i *in) {
+void fadst16_8col_avx2(__m128i *in) {
// perform 16x16 1-D ADST for 8 columns
__m128i s[16], x[16], u[32], v[32];
const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
@@ -2522,15 +2522,15 @@
in[15] = _mm_sub_epi16(kZero, s[1]);
}
-void fdct16_1d_avx2(__m128i *in0, __m128i *in1) {
- fdct16_1d_8col_avx2(in0);
- fdct16_1d_8col_avx2(in1);
+void fdct16_avx2(__m128i *in0, __m128i *in1) {
+ fdct16_8col_avx2(in0);
+ fdct16_8col_avx2(in1);
array_transpose_16x16_avx2(in0, in1);
}
-void fadst16_1d_avx2(__m128i *in0, __m128i *in1) {
- fadst16_1d_8col_avx2(in0);
- fadst16_1d_8col_avx2(in1);
+void fadst16_avx2(__m128i *in0, __m128i *in1) {
+ fadst16_8col_avx2(in0);
+ fadst16_8col_avx2(in1);
array_transpose_16x16_avx2(in0, in1);
}
@@ -2540,24 +2540,24 @@
load_buffer_16x16_avx2(input, in0, in1, stride);
switch (tx_type) {
case 0: // DCT_DCT
- fdct16_1d_avx2(in0, in1);
+ fdct16_avx2(in0, in1);
right_shift_16x16_avx2(in0, in1);
- fdct16_1d_avx2(in0, in1);
+ fdct16_avx2(in0, in1);
break;
case 1: // ADST_DCT
- fadst16_1d_avx2(in0, in1);
+ fadst16_avx2(in0, in1);
right_shift_16x16_avx2(in0, in1);
- fdct16_1d_avx2(in0, in1);
+ fdct16_avx2(in0, in1);
break;
case 2: // DCT_ADST
- fdct16_1d_avx2(in0, in1);
+ fdct16_avx2(in0, in1);
right_shift_16x16_avx2(in0, in1);
- fadst16_1d_avx2(in0, in1);
+ fadst16_avx2(in0, in1);
break;
case 3: // ADST_ADST
- fadst16_1d_avx2(in0, in1);
+ fadst16_avx2(in0, in1);
right_shift_16x16_avx2(in0, in1);
- fadst16_1d_avx2(in0, in1);
+ fadst16_avx2(in0, in1);
break;
default:
assert(0);
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index 65431bd..c876cc2 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c
@@ -161,7 +161,7 @@
res[3] = _mm_unpackhi_epi64(res[2], res[2]);
}
-void fdct4_1d_sse2(__m128i *in) {
+void fdct4_sse2(__m128i *in) {
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
@@ -194,7 +194,7 @@
transpose_4x4(in);
}
-void fadst4_1d_sse2(__m128i *in) {
+void fadst4_sse2(__m128i *in) {
const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
@@ -248,20 +248,20 @@
load_buffer_4x4(input, in, stride);
switch (tx_type) {
case 0: // DCT_DCT
- fdct4_1d_sse2(in);
- fdct4_1d_sse2(in);
+ fdct4_sse2(in);
+ fdct4_sse2(in);
break;
case 1: // ADST_DCT
- fadst4_1d_sse2(in);
- fdct4_1d_sse2(in);
+ fadst4_sse2(in);
+ fdct4_sse2(in);
break;
case 2: // DCT_ADST
- fdct4_1d_sse2(in);
- fadst4_1d_sse2(in);
+ fdct4_sse2(in);
+ fadst4_sse2(in);
break;
case 3: // ADST_ADST
- fadst4_1d_sse2(in);
- fadst4_1d_sse2(in);
+ fadst4_sse2(in);
+ fadst4_sse2(in);
break;
default:
assert(0);
@@ -656,7 +656,7 @@
// 07 17 27 37 47 57 67 77
}
-void fdct8_1d_sse2(__m128i *in) {
+void fdct8_sse2(__m128i *in) {
// constants
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
@@ -796,7 +796,7 @@
array_transpose_8x8(in, in);
}
-void fadst8_1d_sse2(__m128i *in) {
+void fadst8_sse2(__m128i *in) {
// Constants
const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
@@ -1032,20 +1032,20 @@
load_buffer_8x8(input, in, stride);
switch (tx_type) {
case 0: // DCT_DCT
- fdct8_1d_sse2(in);
- fdct8_1d_sse2(in);
+ fdct8_sse2(in);
+ fdct8_sse2(in);
break;
case 1: // ADST_DCT
- fadst8_1d_sse2(in);
- fdct8_1d_sse2(in);
+ fadst8_sse2(in);
+ fdct8_sse2(in);
break;
case 2: // DCT_ADST
- fdct8_1d_sse2(in);
- fadst8_1d_sse2(in);
+ fdct8_sse2(in);
+ fadst8_sse2(in);
break;
case 3: // ADST_ADST
- fadst8_1d_sse2(in);
- fadst8_1d_sse2(in);
+ fadst8_sse2(in);
+ fadst8_sse2(in);
break;
default:
assert(0);
@@ -1214,7 +1214,7 @@
step1_6 = _mm_sub_epi16(in01, in14);
step1_7 = _mm_sub_epi16(in00, in15);
}
- // Work on the first eight values; fdct8_1d(input, even_results);
+ // Work on the first eight values; fdct8(input, even_results);
{
// Add/substract
const __m128i q0 = _mm_add_epi16(input0, input7);
@@ -1728,7 +1728,7 @@
right_shift_8x8(res1 + 8, 2);
}
-void fdct16_1d_8col(__m128i *in) {
+void fdct16_8col(__m128i *in) {
// perform 16x16 1-D DCT for 8 columns
__m128i i[8], s[8], p[8], t[8], u[16], v[16];
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
@@ -2050,7 +2050,7 @@
in[15] = _mm_packs_epi32(v[14], v[15]);
}
-void fadst16_1d_8col(__m128i *in) {
+void fadst16_8col(__m128i *in) {
// perform 16x16 1-D ADST for 8 columns
__m128i s[16], x[16], u[32], v[32];
const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
@@ -2520,15 +2520,15 @@
in[15] = _mm_sub_epi16(kZero, s[1]);
}
-void fdct16_1d_sse2(__m128i *in0, __m128i *in1) {
- fdct16_1d_8col(in0);
- fdct16_1d_8col(in1);
+void fdct16_sse2(__m128i *in0, __m128i *in1) {
+ fdct16_8col(in0);
+ fdct16_8col(in1);
array_transpose_16x16(in0, in1);
}
-void fadst16_1d_sse2(__m128i *in0, __m128i *in1) {
- fadst16_1d_8col(in0);
- fadst16_1d_8col(in1);
+void fadst16_sse2(__m128i *in0, __m128i *in1) {
+ fadst16_8col(in0);
+ fadst16_8col(in1);
array_transpose_16x16(in0, in1);
}
@@ -2538,24 +2538,24 @@
load_buffer_16x16(input, in0, in1, stride);
switch (tx_type) {
case 0: // DCT_DCT
- fdct16_1d_sse2(in0, in1);
+ fdct16_sse2(in0, in1);
right_shift_16x16(in0, in1);
- fdct16_1d_sse2(in0, in1);
+ fdct16_sse2(in0, in1);
break;
case 1: // ADST_DCT
- fadst16_1d_sse2(in0, in1);
+ fadst16_sse2(in0, in1);
right_shift_16x16(in0, in1);
- fdct16_1d_sse2(in0, in1);
+ fdct16_sse2(in0, in1);
break;
case 2: // DCT_ADST
- fdct16_1d_sse2(in0, in1);
+ fdct16_sse2(in0, in1);
right_shift_16x16(in0, in1);
- fadst16_1d_sse2(in0, in1);
+ fadst16_sse2(in0, in1);
break;
case 3: // ADST_ADST
- fadst16_1d_sse2(in0, in1);
+ fadst16_sse2(in0, in1);
right_shift_16x16(in0, in1);
- fadst16_1d_sse2(in0, in1);
+ fadst16_sse2(in0, in1);
break;
default:
assert(0);
diff --git a/vp9_spatial_scalable_encoder.c b/vp9_spatial_scalable_encoder.c
index b637331..e71094a 100644
--- a/vp9_spatial_scalable_encoder.c
+++ b/vp9_spatial_scalable_encoder.c
@@ -89,14 +89,6 @@
exit(EXIT_FAILURE);
}
-static void die_codec(vpx_codec_ctx_t *ctx, const char *s) {
- const char *detail = vpx_codec_error_detail(ctx);
-
- printf("%s: %s\n", s, vpx_codec_error(ctx));
- if (detail) printf(" %s\n", detail);
- exit(EXIT_FAILURE);
-}
-
static void parse_command_line(int argc, const char **argv_,
AppInput *app_input, SvcContext *svc_ctx,
vpx_codec_enc_cfg_t *enc_cfg) {