Merge "Convert to unsigned int before left shift"
diff --git a/build/make/ios-Info.plist b/build/make/ios-Info.plist
index 8d1da32..d157b11 100644
--- a/build/make/ios-Info.plist
+++ b/build/make/ios-Info.plist
@@ -31,5 +31,7 @@
<integer>1</integer>
<integer>2</integer>
</array>
+ <key>VPXFullVersion</key>
+ <string>${FULLVERSION}</string>
</dict>
</plist>
diff --git a/build/make/iosbuild.sh b/build/make/iosbuild.sh
index 2161074..96dc6cc 100755
--- a/build/make/iosbuild.sh
+++ b/build/make/iosbuild.sh
@@ -226,6 +226,7 @@
# Copy in Info.plist.
cat "${SCRIPT_DIR}/ios-Info.plist" \
+ | sed "s/\${FULLVERSION}/${FULLVERSION}/g" \
| sed "s/\${VERSION}/${VERSION}/g" \
| sed "s/\${IOS_VERSION_MIN}/${IOS_VERSION_MIN}/g" \
> "${FRAMEWORK_DIR}/Info.plist"
@@ -341,8 +342,9 @@
CONFIGURE_ARGS="--enable-shared ${CONFIGURE_ARGS}"
fi
-VERSION=$("${SCRIPT_DIR}"/version.sh --bare "${LIBVPX_SOURCE_DIR}" \
- | sed -E 's/^v(.*)$/\1/')
+FULLVERSION=$("${SCRIPT_DIR}"/version.sh --bare "${LIBVPX_SOURCE_DIR}")
+VERSION=$(echo "${FULLVERSION}" | sed -E 's/^v([0-9]+\.[0-9]+\.[0-9]+).*$/\1/')
+
if [ "$ENABLE_SHARED" = "yes" ]; then
IOS_VERSION_OPTIONS="--enable-shared"
else
@@ -369,6 +371,7 @@
OSX_TARGETS="${OSX_TARGETS}"
SIM_TARGETS="${SIM_TARGETS}"
SCRIPT_DIR="${SCRIPT_DIR}"
+ FULLVERSION="${FULLVERSION}"
VERSION="${VERSION}"
IOS_VERSION_MIN="${IOS_VERSION_MIN}"
EOF
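
The VERSION derivation above splits one version.sh result into two values: FULLVERSION keeps the whole `git describe` style string for the new VPXFullVersion plist key, while the sed expression keeps only the leading numeric x.y.z core for VERSION. A minimal C sketch of that extraction, using a hypothetical version string:

    #include <stdio.h>

    int main(void) {
      /* Hypothetical `version.sh --bare` output. */
      const char *fullversion = "v1.5.0-123-g0123abc";
      int major, minor, patch;
      /* Mirrors sed -E 's/^v([0-9]+\.[0-9]+\.[0-9]+).*$/\1/' */
      if (sscanf(fullversion, "v%d.%d.%d", &major, &minor, &patch) == 3)
        printf("FULLVERSION=%s VERSION=%d.%d.%d\n", fullversion, major, minor,
               patch);
      return 0;
    }
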
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index 5467c46..3941e16 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -450,7 +450,28 @@
int denoiser_offon_period_;
};
-// Check basic rate targeting,
+// Check basic rate targeting for VBR mode.
+TEST_P(DatarateTestVP9Large, BasicRateTargetingVBR) {
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.g_error_resilient = 0;
+ cfg_.rc_end_usage = VPX_VBR;
+ cfg_.g_lag_in_frames = 0;
+
+ ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ 30, 1, 0, 300);
+ for (int i = 400; i <= 800; i += 400) {
+ cfg_.rc_target_bitrate = i;
+ ResetModel();
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25)
+ << " The datarate for the file is greater than target by too much!";
+ }
+}
+
+// Check basic rate targeting for CBR.
TEST_P(DatarateTestVP9Large, BasicRateTargeting) {
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 500;
@@ -474,7 +495,7 @@
}
}
-// Check basic rate targeting,
+// Check basic rate targeting for CBR.
TEST_P(DatarateTestVP9Large, BasicRateTargeting444) {
::libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140);
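
The new VBR test drives two target bitrates (400 and 800 kbps) and passes as long as the measured average rate lands within 25% of the target on either side. A standalone sketch of that pass criterion; the helper name is ours, not the test's:

    #include <stdio.h>

    /* Same bounds as the ASSERT_GE/ASSERT_LE pair in BasicRateTargetingVBR. */
    static int rate_within_tolerance(double effective_kbps, double target_kbps) {
      return effective_kbps >= target_kbps * 0.75 &&
             effective_kbps <= target_kbps * 1.25;
    }

    int main(void) {
      printf("%d\n", rate_within_tolerance(430.0, 400.0)); /* 1: within 25% */
      printf("%d\n", rate_within_tolerance(90.0, 400.0));  /* 0: far too low */
      return 0;
    }
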
diff --git a/test/variance_test.cc b/test/variance_test.cc
index a6efc92..e2f6385 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -1026,8 +1026,8 @@
make_tuple(3, 4, &vpx_sub_pixel_variance8x16_sse2, 0),
make_tuple(3, 3, &vpx_sub_pixel_variance8x8_sse2, 0),
make_tuple(3, 2, &vpx_sub_pixel_variance8x4_sse2, 0),
- make_tuple(2, 3, &vpx_sub_pixel_variance4x8_sse, 0),
- make_tuple(2, 2, &vpx_sub_pixel_variance4x4_sse, 0)));
+ make_tuple(2, 3, &vpx_sub_pixel_variance4x8_sse2, 0),
+ make_tuple(2, 2, &vpx_sub_pixel_variance4x4_sse2, 0)));
INSTANTIATE_TEST_CASE_P(
SSE2, VpxSubpelAvgVarianceTest,
@@ -1043,8 +1043,8 @@
make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_sse2, 0),
make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_sse2, 0),
make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_sse2, 0),
- make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_sse, 0),
- make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse, 0)));
+ make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_sse2, 0),
+ make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0)));
#endif // CONFIG_USE_X86INC
#if CONFIG_VP9_HIGHBITDEPTH
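
In these instantiations the first two tuple fields are log2 block width and log2 block height, so (2, 3) selects the 4x8 kernel and (2, 2) the 4x4 kernel, exactly the pair that moves from the retired _sse implementations to _sse2. A quick check of the mapping:

    #include <stdio.h>

    int main(void) {
      /* log2(width), log2(height) pairs as used in the test tuples above. */
      const int pairs[][2] = { { 3, 4 }, { 3, 3 }, { 3, 2 }, { 2, 3 }, { 2, 2 } };
      int i;
      for (i = 0; i < 5; ++i)
        printf("(%d, %d) -> %dx%d\n", pairs[i][0], pairs[i][1],
               1 << pairs[i][0], 1 << pairs[i][1]);
      return 0;
    }
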
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 6e21bb1..d639129 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -1339,22 +1339,23 @@
// has valid dimensions.
for (i = 0; i < REFS_PER_FRAME; ++i) {
RefBuffer *const ref_frame = &cm->frame_refs[i];
- has_valid_ref_frame |= valid_ref_frame_size(ref_frame->buf->y_crop_width,
- ref_frame->buf->y_crop_height,
- width, height);
+ has_valid_ref_frame |= (ref_frame->idx != INVALID_IDX &&
+ valid_ref_frame_size(ref_frame->buf->y_crop_width,
+ ref_frame->buf->y_crop_height,
+ width, height));
}
if (!has_valid_ref_frame)
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
"Referenced frame has invalid size");
for (i = 0; i < REFS_PER_FRAME; ++i) {
RefBuffer *const ref_frame = &cm->frame_refs[i];
- if (!valid_ref_frame_img_fmt(
- ref_frame->buf->bit_depth,
- ref_frame->buf->subsampling_x,
- ref_frame->buf->subsampling_y,
- cm->bit_depth,
- cm->subsampling_x,
- cm->subsampling_y))
+ if (ref_frame->idx == INVALID_IDX ||
+ !valid_ref_frame_img_fmt(ref_frame->buf->bit_depth,
+ ref_frame->buf->subsampling_x,
+ ref_frame->buf->subsampling_y,
+ cm->bit_depth,
+ cm->subsampling_x,
+ cm->subsampling_y))
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
"Referenced frame has incompatible color format");
}
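
Both loops now test ref_frame->idx against INVALID_IDX before touching ref_frame->buf: a corrupt bitstream can name a reference slot that was never populated, and dereferencing its buf pointer in that state is exactly the failure this change prevents. A reduced sketch of the guard, with hypothetical types standing in for RefBuffer and a placeholder for the real size check:

    #include <stddef.h>

    #define INVALID_IDX (-1) /* sentinel for an unpopulated reference slot */

    struct buf { int y_crop_width, y_crop_height; };
    struct ref_slot { int idx; struct buf *buf; };

    /* Returns 0 without dereferencing buf when the slot is empty; the size
     * test body is a stand-in for the decoder's valid_ref_frame_size(). */
    static int ref_frame_usable(const struct ref_slot *ref, int width, int height) {
      if (ref->idx == INVALID_IDX || ref->buf == NULL) return 0;
      return ref->buf->y_crop_width > 0 && ref->buf->y_crop_height > 0 &&
             width > 0 && height > 0;
    }
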
diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
deleted file mode 100644
index 0bc417f..0000000
--- a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
+++ /dev/null
@@ -1,323 +0,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#if defined(_MSC_VER)
-# include <intrin.h>
-#endif
-#include <emmintrin.h>
-#include <smmintrin.h>
-
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vp9/encoder/vp9_encoder.h"
-#include "vpx_ports/mem.h"
-
-#ifdef __GNUC__
-# define LIKELY(v) __builtin_expect(v, 1)
-# define UNLIKELY(v) __builtin_expect(v, 0)
-#else
-# define LIKELY(v) (v)
-# define UNLIKELY(v) (v)
-#endif
-
-static INLINE int_mv pack_int_mv(int16_t row, int16_t col) {
- int_mv result;
- result.as_mv.row = row;
- result.as_mv.col = col;
- return result;
-}
-
-static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) {
- // This is simplified from the C implementation to utilise that
- // x->nmvjointsadcost[1] == x->nmvjointsadcost[2] and
- // x->nmvjointsadcost[1] == x->nmvjointsadcost[3]
- return mv.as_int == 0 ? 0 : 1;
-}
-
-static INLINE int mv_cost(const int_mv mv,
- const int *joint_cost, int *const comp_cost[2]) {
- return joint_cost[get_mv_joint(mv)] +
- comp_cost[0][mv.as_mv.row] + comp_cost[1][mv.as_mv.col];
-}
-
-static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref,
- int sad_per_bit) {
- const int_mv diff = pack_int_mv(mv.as_mv.row - ref->row,
- mv.as_mv.col - ref->col);
- return ROUND_POWER_OF_TWO((unsigned)mv_cost(diff, x->nmvjointsadcost,
- x->nmvsadcost) *
- sad_per_bit, VP9_PROB_COST_SHIFT);
-}
-
-/*****************************************************************************
- * This function utilises 3 properties of the cost function lookup tables, *
- * constructed using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in *
- * vp9_encoder.c. *
- * For the joint cost: *
- * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] *
- * For the component costs: *
- * - For all i: mvsadcost[0][i] == mvsadcost[1][i] *
- * (Equal costs for both components) *
- * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] *
- * (Cost function is even) *
- * If these do not hold, then this function cannot be used without *
- * modification, in which case you can revert to using the C implementation, *
- * which does not rely on these properties. *
- *****************************************************************************/
-int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
- const search_site_config *cfg,
- MV *ref_mv, MV *best_mv, int search_param,
- int sad_per_bit, int *num00,
- const vp9_variance_fn_ptr_t *fn_ptr,
- const MV *center_mv) {
- const int_mv maxmv = pack_int_mv(x->mv_row_max, x->mv_col_max);
- const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int);
- const int_mv minmv = pack_int_mv(x->mv_row_min, x->mv_col_min);
- const __m128i v_min_mv_w = _mm_set1_epi32(minmv.as_int);
-
- const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit);
-
- const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]);
- const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]);
-
- // search_param determines the length of the initial step and hence the number
- // of iterations.
- // 0 = initial step (MAX_FIRST_STEP) pel
- // 1 = (MAX_FIRST_STEP/2) pel,
- // 2 = (MAX_FIRST_STEP/4) pel...
- const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param];
- const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param];
- const int tot_steps = cfg->total_steps - search_param;
-
- const int_mv fcenter_mv = pack_int_mv(center_mv->row >> 3,
- center_mv->col >> 3);
- const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int);
-
- const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row);
- const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col);
-
- int_mv bmv = pack_int_mv(ref_row, ref_col);
- int_mv new_bmv = bmv;
- __m128i v_bmv_w = _mm_set1_epi32(bmv.as_int);
-
- const int what_stride = x->plane[0].src.stride;
- const int in_what_stride = x->e_mbd.plane[0].pre[0].stride;
- const uint8_t *const what = x->plane[0].src.buf;
- const uint8_t *const in_what = x->e_mbd.plane[0].pre[0].buf +
- ref_row * in_what_stride + ref_col;
-
- // Work out the start point for the search
- const uint8_t *best_address = in_what;
- const uint8_t *new_best_address = best_address;
-#if ARCH_X86_64
- __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
-#else
- __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address);
-#endif
-
- unsigned int best_sad;
-
- int i;
- int j;
- int step;
-
- // Check the prerequisite cost function properties that are easy to check
- // in an assert. See the function-level documentation for details on all
- // prerequisites.
- assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]);
- assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]);
-
- // Check the starting position
- best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride);
- best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit);
-
- *num00 = 0;
-
- for (i = 0, step = 0; step < tot_steps; step++) {
- for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) {
- __m128i v_sad_d;
- __m128i v_cost_d;
- __m128i v_outside_d;
- __m128i v_inside_d;
- __m128i v_diff_mv_w;
-#if ARCH_X86_64
- __m128i v_blocka[2];
-#else
- __m128i v_blocka[1];
-#endif
-
- // Compute the candidate motion vectors
- const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i*)&ss_mv[i]);
- const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w);
- // Clamp them to the search bounds
- __m128i v_these_mv_clamp_w = v_these_mv_w;
- v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w);
- v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w);
- // The ones that did not change are inside the search area
- v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w);
-
- // If none of them are inside, then move on
- if (LIKELY(_mm_test_all_zeros(v_inside_d, v_inside_d))) {
- continue;
- }
-
- // The inverse mask indicates which of the MVs are outside
- v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8(0xff));
- // Shift right to keep the sign bit clear; we will use this later
- // to set the cost to the maximum value.
- v_outside_d = _mm_srli_epi32(v_outside_d, 1);
-
- // Compute the difference MV
- v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv);
- // We utilise the fact that the cost function is even, and use the
- // absolute difference. This allows us to use unsigned indexes later
- // and reduces cache pressure somewhat as only half of the table
- // is ever referenced.
- v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w);
-
- // Compute the SIMD pointer offsets.
- {
-#if ARCH_X86_64 // sizeof(intptr_t) == 8
- // Load the offsets
- __m128i v_bo10_q = _mm_loadu_si128((const __m128i*)&ss_os[i+0]);
- __m128i v_bo32_q = _mm_loadu_si128((const __m128i*)&ss_os[i+2]);
- // Set the ones falling outside to zero
- v_bo10_q = _mm_and_si128(v_bo10_q,
- _mm_cvtepi32_epi64(v_inside_d));
- v_bo32_q = _mm_and_si128(v_bo32_q,
- _mm_unpackhi_epi32(v_inside_d, v_inside_d));
- // Compute the candidate addresses
- v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q);
- v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q);
-#else // ARCH_X86 // sizeof(intptr_t) == 4
- __m128i v_bo_d = _mm_loadu_si128((const __m128i*)&ss_os[i]);
- v_bo_d = _mm_and_si128(v_bo_d, v_inside_d);
- v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d);
-#endif
- }
-
- fn_ptr->sdx4df(what, what_stride,
- (const uint8_t **)&v_blocka[0], in_what_stride,
- (uint32_t*)&v_sad_d);
-
- // Look up the component cost of the residual motion vector
- {
- const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0);
- const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1);
- const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2);
- const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3);
- const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4);
- const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5);
- const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6);
- const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7);
-
- // Note: This is a use case for vpgather in AVX2
- const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0];
- const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1];
- const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2];
- const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3];
-
- __m128i v_cost_10_d, v_cost_32_d;
-
- v_cost_10_d = _mm_cvtsi32_si128(cost0);
- v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1);
-
- v_cost_32_d = _mm_cvtsi32_si128(cost2);
- v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1);
-
- v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d);
- }
-
- // Now add in the joint cost
- {
- const __m128i v_sel_d = _mm_cmpeq_epi32(v_diff_mv_w,
- _mm_setzero_si128());
- const __m128i v_joint_cost_d = _mm_blendv_epi8(v_joint_cost_1_d,
- v_joint_cost_0_d,
- v_sel_d);
- v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d);
- }
-
- // Multiply by sad_per_bit
- v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d);
- // ROUND_POWER_OF_TWO(v_cost_d, 8)
- v_cost_d = _mm_add_epi32(v_cost_d, _mm_set1_epi32(0x80));
- v_cost_d = _mm_srai_epi32(v_cost_d, 8);
- // Add the cost to the sad
- v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d);
-
- // Make the motion vectors outside the search area have max cost
- // by or'ing in the comparison mask; this way the minimum search won't
- // pick them.
- v_sad_d = _mm_or_si128(v_sad_d, v_outside_d);
-
- // Find the minimum value and index horizontally in v_sad_d
- {
- // Try speculatively on 16 bits, so we can use the minpos intrinsic
- const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d);
- const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w);
-
- uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0);
- uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1);
-
- // If the local best value is not saturated, just use it, otherwise
- // find the horizontal minimum again the hard way on 32 bits.
- // This is executed rarely.
- if (UNLIKELY(local_best_sad == 0xffff)) {
- __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d;
-
- v_loval_d = v_sad_d;
- v_loidx_d = _mm_set_epi32(3, 2, 1, 0);
- v_hival_d = _mm_srli_si128(v_loval_d, 8);
- v_hiidx_d = _mm_srli_si128(v_loidx_d, 8);
-
- v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);
-
- v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
- v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
- v_hival_d = _mm_srli_si128(v_loval_d, 4);
- v_hiidx_d = _mm_srli_si128(v_loidx_d, 4);
-
- v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);
-
- v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
- v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
-
- local_best_sad = _mm_extract_epi32(v_loval_d, 0);
- local_best_idx = _mm_extract_epi32(v_loidx_d, 0);
- }
-
- // Update the global minimum if the local minimum is smaller
- if (LIKELY(local_best_sad < best_sad)) {
- new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx];
- new_best_address = ((const uint8_t **)v_blocka)[local_best_idx];
-
- best_sad = local_best_sad;
- }
- }
- }
-
- bmv = new_bmv;
- best_address = new_best_address;
-
- v_bmv_w = _mm_set1_epi32(bmv.as_int);
-#if ARCH_X86_64
- v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
-#else
- v_ba_d = _mm_set1_epi32((intptr_t)best_address);
-#endif
-
- if (UNLIKELY(best_address == in_what)) {
- (*num00)++;
- }
- }
-
- *best_mv = bmv.as_mv;
- return best_sad;
-}
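
The deleted file also illustrates the merge subject: in mvsad_err_cost() the mv_cost() result is cast to unsigned before the multiply feeding ROUND_POWER_OF_TWO, because overflowing or shifting into the sign bit of a signed int is undefined behavior in C, while unsigned arithmetic is fully defined. A minimal sketch of the idiom with a hypothetical value:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      int32_t cost = 0x40000000; /* hypothetical large positive cost */
      /* As a signed shift, cost << 1 would overflow INT32_MAX: undefined
       * behavior. Cast to unsigned first and the shift is well defined. */
      uint32_t shifted = (uint32_t)cost << 1;
      printf("0x%08" PRIx32 "\n", shifted); /* 0x80000000 */
      return 0;
    }
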
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 2930c23..01b7aa5 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -96,7 +96,6 @@
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
-VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
endif
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index d556117..ee24d13 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1493,10 +1493,10 @@
specialize qw/vpx_sub_pixel_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
+ specialize qw/vpx_sub_pixel_variance4x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance4x4 mmx msa/, "$sse_x86inc", "$ssse3_x86inc";
+ specialize qw/vpx_sub_pixel_variance4x4 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance64x64 avx2 msa/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -1532,10 +1532,10 @@
specialize qw/vpx_sub_pixel_avg_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
+ specialize qw/vpx_sub_pixel_avg_variance4x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance4x4 msa/, "$sse_x86inc", "$ssse3_x86inc";
+ specialize qw/vpx_sub_pixel_avg_variance4x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
#
# Specialty Subpixel
diff --git a/vpx_dsp/x86/subpel_variance_sse2.asm b/vpx_dsp/x86/subpel_variance_sse2.asm
index be35975..cee4468 100644
--- a/vpx_dsp/x86/subpel_variance_sse2.asm
+++ b/vpx_dsp/x86/subpel_variance_sse2.asm
@@ -57,8 +57,8 @@
paddd %6, %1
%endmacro
-%macro STORE_AND_RET 0
-%if mmsize == 16
+%macro STORE_AND_RET 1
+%if %1 > 4
; if H=64 and W=16, we have 8 words of 2(1bit)x64(6bit)x9bit=16bit
; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
; We have to sign-extend it before adding the words within the register
@@ -78,16 +78,16 @@
movd [r1], m7 ; store sse
paddd m6, m4
movd raxd, m6 ; store sum as return value
-%else ; mmsize == 8
- pshufw m4, m6, 0xe
- pshufw m3, m7, 0xe
+%else ; 4xh
+ pshuflw m4, m6, 0xe
+ pshuflw m3, m7, 0xe
paddw m6, m4
paddd m7, m3
pcmpgtw m5, m6 ; mask for 0 > x
mov r1, ssem ; r1 = unsigned int *sse
punpcklwd m6, m5 ; sign-extend m6 word->dword
movd [r1], m7 ; store sse
- pshufw m4, m6, 0xe
+ pshuflw m4, m6, 0xe
paddd m6, m4
movd raxd, m6 ; store sum as return value
%endif
@@ -196,6 +196,12 @@
%endif
%endif
+%if %1 == 4
+ %define movx movd
+%else
+ %define movx movh
+%endif
+
ASSERT %1 <= 16 ; m6 overflows if w > 16
pxor m6, m6 ; sum
pxor m7, m7 ; sse
@@ -228,6 +234,7 @@
%endif
punpckhbw m2, m0, m5
punpcklbw m0, m5
+
%if %2 == 0 ; !avg
punpckhbw m3, m1, m5
punpcklbw m1, m5
@@ -237,24 +244,37 @@
add srcq, src_strideq
add dstq, dst_strideq
%else ; %1 < 16
- movh m0, [srcq]
+ movx m0, [srcq]
%if %2 == 1 ; avg
-%if mmsize == 16
+%if %1 > 4
movhps m0, [srcq+src_strideq]
-%else ; mmsize == 8
- punpckldq m0, [srcq+src_strideq]
+%else ; 4xh
+ movx m1, [srcq+src_strideq]
+ punpckldq m0, m1
%endif
%else ; !avg
- movh m2, [srcq+src_strideq]
+ movx m2, [srcq+src_strideq]
%endif
- movh m1, [dstq]
- movh m3, [dstq+dst_strideq]
+
+ movx m1, [dstq]
+ movx m3, [dstq+dst_strideq]
+
%if %2 == 1 ; avg
+%if %1 > 4
pavgb m0, [secq]
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+%endif
punpcklbw m3, m5
punpcklbw m1, m5
+%if %1 > 4
punpckhbw m2, m0, m5
punpcklbw m0, m5
+%else ; 4xh
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
%else ; !avg
punpcklbw m0, m5
punpcklbw m2, m5
@@ -271,7 +291,7 @@
%endif
dec block_height
jg .x_zero_y_zero_loop
- STORE_AND_RET
+ STORE_AND_RET %1
.x_zero_y_nonzero:
cmp y_offsetd, 4
@@ -296,37 +316,41 @@
add srcq, src_strideq
add dstq, dst_strideq
%else ; %1 < 16
- movh m0, [srcq]
- movh m2, [srcq+src_strideq]
+ movx m0, [srcq]
+ movx m2, [srcq+src_strideq]
%if %2 == 1 ; avg
-%if mmsize == 16
+%if %1 > 4
movhps m2, [srcq+src_strideq*2]
-%else ; mmsize == 8
-%if %1 == 4
- movh m1, [srcq+src_strideq*2]
+%else ; 4xh
+ movx m1, [srcq+src_strideq*2]
punpckldq m2, m1
-%else
- punpckldq m2, [srcq+src_strideq*2]
%endif
-%endif
- movh m1, [dstq]
-%if mmsize == 16
+ movx m1, [dstq]
+%if %1 > 4
movlhps m0, m2
-%else ; mmsize == 8
+%else ; 4xh
punpckldq m0, m2
%endif
- movh m3, [dstq+dst_strideq]
+ movx m3, [dstq+dst_strideq]
pavgb m0, m2
punpcklbw m1, m5
+%if %1 > 4
pavgb m0, [secq]
punpcklbw m3, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
+%else ; 4xh
+ movh m4, [secq]
+ pavgb m0, m4
+ punpcklbw m3, m5
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
%else ; !avg
- movh m4, [srcq+src_strideq*2]
- movh m1, [dstq]
+ movx m4, [srcq+src_strideq*2]
+ movx m1, [dstq]
pavgb m0, m2
- movh m3, [dstq+dst_strideq]
+ movx m3, [dstq+dst_strideq]
pavgb m2, m4
punpcklbw m0, m5
punpcklbw m2, m5
@@ -343,7 +367,7 @@
%endif
dec block_height
jg .x_zero_y_half_loop
- STORE_AND_RET
+ STORE_AND_RET %1
.x_zero_y_nonhalf:
; x_offset == 0 && y_offset == bilin interpolation
@@ -351,7 +375,7 @@
lea bilin_filter, [bilin_filter_m]
%endif
shl y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
+%if ARCH_X86_64 && %1 > 4
mova m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+y_offsetq+16]
@@ -424,12 +448,12 @@
add srcq, src_strideq
add dstq, dst_strideq
%else ; %1 < 16
- movh m0, [srcq]
- movh m2, [srcq+src_strideq]
- movh m4, [srcq+src_strideq*2]
- movh m3, [dstq+dst_strideq]
+ movx m0, [srcq]
+ movx m2, [srcq+src_strideq]
+ movx m4, [srcq+src_strideq*2]
+ movx m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
- movh m1, [dstq]
+ movx m1, [dstq]
punpcklbw m0, m2
punpcklbw m2, m4
pmaddubsw m0, filter_y_a
@@ -449,17 +473,27 @@
pmullw m4, filter_y_b
paddw m0, m1
paddw m2, filter_rnd
- movh m1, [dstq]
+ movx m1, [dstq]
paddw m2, m4
%endif
psraw m0, 4
psraw m2, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
+%if %1 == 4
+ movlhps m0, m2
+%endif
packuswb m0, m2
+%if %1 > 4
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
+%else ; 4xh
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
%endif
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
@@ -475,7 +509,7 @@
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
- STORE_AND_RET
+ STORE_AND_RET %1
.x_nonzero:
cmp x_offsetd, 4
@@ -503,30 +537,40 @@
add srcq, src_strideq
add dstq, dst_strideq
%else ; %1 < 16
- movh m0, [srcq]
- movh m4, [srcq+1]
+ movx m0, [srcq]
+ movx m4, [srcq+1]
%if %2 == 1 ; avg
-%if mmsize == 16
+%if %1 > 4
movhps m0, [srcq+src_strideq]
movhps m4, [srcq+src_strideq+1]
-%else ; mmsize == 8
- punpckldq m0, [srcq+src_strideq]
- punpckldq m4, [srcq+src_strideq+1]
+%else ; 4xh
+ movx m1, [srcq+src_strideq]
+ punpckldq m0, m1
+ movx m2, [srcq+src_strideq+1]
+ punpckldq m4, m2
%endif
- movh m1, [dstq]
- movh m3, [dstq+dst_strideq]
+ movx m1, [dstq]
+ movx m3, [dstq+dst_strideq]
pavgb m0, m4
punpcklbw m3, m5
+%if %1 > 4
pavgb m0, [secq]
punpcklbw m1, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
+%else ; 4xh
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m1, m5
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
%else ; !avg
- movh m2, [srcq+src_strideq]
- movh m1, [dstq]
+ movx m2, [srcq+src_strideq]
+ movx m1, [dstq]
pavgb m0, m4
- movh m4, [srcq+src_strideq+1]
- movh m3, [dstq+dst_strideq]
+ movx m4, [srcq+src_strideq+1]
+ movx m3, [dstq+dst_strideq]
pavgb m2, m4
punpcklbw m0, m5
punpcklbw m2, m5
@@ -543,7 +587,7 @@
%endif
dec block_height
jg .x_half_y_zero_loop
- STORE_AND_RET
+ STORE_AND_RET %1
.x_half_y_nonzero:
cmp y_offsetd, 4
@@ -578,53 +622,58 @@
add srcq, src_strideq
add dstq, dst_strideq
%else ; %1 < 16
- movh m0, [srcq]
- movh m3, [srcq+1]
+ movx m0, [srcq]
+ movx m3, [srcq+1]
add srcq, src_strideq
pavgb m0, m3
.x_half_y_half_loop:
- movh m2, [srcq]
- movh m3, [srcq+1]
+ movx m2, [srcq]
+ movx m3, [srcq+1]
%if %2 == 1 ; avg
-%if mmsize == 16
+%if %1 > 4
movhps m2, [srcq+src_strideq]
movhps m3, [srcq+src_strideq+1]
%else
-%if %1 == 4
- movh m1, [srcq+src_strideq]
+ movx m1, [srcq+src_strideq]
punpckldq m2, m1
- movh m1, [srcq+src_strideq+1]
+ movx m1, [srcq+src_strideq+1]
punpckldq m3, m1
-%else
- punpckldq m2, [srcq+src_strideq]
- punpckldq m3, [srcq+src_strideq+1]
-%endif
%endif
pavgb m2, m3
-%if mmsize == 16
+%if %1 > 4
movlhps m0, m2
movhlps m4, m2
-%else ; mmsize == 8
+%else ; 4xh
punpckldq m0, m2
- pshufw m4, m2, 0xe
+ pshuflw m4, m2, 0xe
%endif
- movh m1, [dstq]
+ movx m1, [dstq]
pavgb m0, m2
- movh m3, [dstq+dst_strideq]
+ movx m3, [dstq+dst_strideq]
+%if %1 > 4
pavgb m0, [secq]
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+%endif
punpcklbw m3, m5
punpcklbw m1, m5
+%if %1 > 4
punpckhbw m2, m0, m5
punpcklbw m0, m5
+%else
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
%else ; !avg
- movh m4, [srcq+src_strideq]
- movh m1, [srcq+src_strideq+1]
+ movx m4, [srcq+src_strideq]
+ movx m1, [srcq+src_strideq+1]
pavgb m2, m3
pavgb m4, m1
pavgb m0, m2
pavgb m2, m4
- movh m1, [dstq]
- movh m3, [dstq+dst_strideq]
+ movx m1, [dstq]
+ movx m3, [dstq+dst_strideq]
punpcklbw m0, m5
punpcklbw m2, m5
punpcklbw m3, m5
@@ -641,7 +690,7 @@
%endif
dec block_height
jg .x_half_y_half_loop
- STORE_AND_RET
+ STORE_AND_RET %1
.x_half_y_nonhalf:
; x_offset == 0.5 && y_offset == bilin interpolation
@@ -649,7 +698,7 @@
lea bilin_filter, [bilin_filter_m]
%endif
shl y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
+%if ARCH_X86_64 && %1 > 4
mova m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+y_offsetq+16]
@@ -724,23 +773,23 @@
add srcq, src_strideq
add dstq, dst_strideq
%else ; %1 < 16
- movh m0, [srcq]
- movh m3, [srcq+1]
+ movx m0, [srcq]
+ movx m3, [srcq+1]
add srcq, src_strideq
pavgb m0, m3
%if notcpuflag(ssse3)
punpcklbw m0, m5
%endif
.x_half_y_other_loop:
- movh m2, [srcq]
- movh m1, [srcq+1]
- movh m4, [srcq+src_strideq]
- movh m3, [srcq+src_strideq+1]
+ movx m2, [srcq]
+ movx m1, [srcq+1]
+ movx m4, [srcq+src_strideq]
+ movx m3, [srcq+src_strideq+1]
pavgb m2, m1
pavgb m4, m3
- movh m3, [dstq+dst_strideq]
+ movx m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
- movh m1, [dstq]
+ movx m1, [dstq]
punpcklbw m0, m2
punpcklbw m2, m4
pmaddubsw m0, filter_y_a
@@ -760,16 +809,26 @@
pmullw m1, m4, filter_y_b
paddw m2, filter_rnd
paddw m2, m1
- movh m1, [dstq]
+ movx m1, [dstq]
%endif
psraw m0, 4
psraw m2, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
+%if %1 == 4
+ movlhps m0, m2
+%endif
packuswb m0, m2
+%if %1 > 4
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
%endif
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
@@ -786,7 +845,7 @@
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
- STORE_AND_RET
+ STORE_AND_RET %1
.x_nonhalf:
test y_offsetd, y_offsetd
@@ -797,7 +856,7 @@
lea bilin_filter, [bilin_filter_m]
%endif
shl x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
+%if ARCH_X86_64 && %1 > 4
mova m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+x_offsetq+16]
@@ -865,14 +924,14 @@
add srcq, src_strideq
add dstq, dst_strideq
%else ; %1 < 16
- movh m0, [srcq]
- movh m1, [srcq+1]
- movh m2, [srcq+src_strideq]
- movh m4, [srcq+src_strideq+1]
- movh m3, [dstq+dst_strideq]
+ movx m0, [srcq]
+ movx m1, [srcq+1]
+ movx m2, [srcq+src_strideq]
+ movx m4, [srcq+src_strideq+1]
+ movx m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
punpcklbw m0, m1
- movh m1, [dstq]
+ movx m1, [dstq]
punpcklbw m2, m4
pmaddubsw m0, filter_x_a
pmaddubsw m2, filter_x_a
@@ -892,17 +951,27 @@
pmullw m4, filter_x_b
paddw m0, m1
paddw m2, filter_rnd
- movh m1, [dstq]
+ movx m1, [dstq]
paddw m2, m4
%endif
psraw m0, 4
psraw m2, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
+%if %1 == 4
+ movlhps m0, m2
+%endif
packuswb m0, m2
+%if %1 > 4
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
%endif
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
@@ -918,7 +987,7 @@
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
- STORE_AND_RET
+ STORE_AND_RET %1
.x_nonhalf_y_nonzero:
cmp y_offsetd, 4
@@ -929,7 +998,7 @@
lea bilin_filter, [bilin_filter_m]
%endif
shl x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
+%if ARCH_X86_64 && %1 > 4
mova m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+x_offsetq+16]
@@ -1037,8 +1106,8 @@
add srcq, src_strideq
add dstq, dst_strideq
%else ; %1 < 16
- movh m0, [srcq]
- movh m1, [srcq+1]
+ movx m0, [srcq]
+ movx m1, [srcq+1]
%if cpuflag(ssse3)
punpcklbw m0, m1
pmaddubsw m0, filter_x_a
@@ -1054,17 +1123,17 @@
add srcq, src_strideq
psraw m0, 4
.x_other_y_half_loop:
- movh m2, [srcq]
- movh m1, [srcq+1]
- movh m4, [srcq+src_strideq]
- movh m3, [srcq+src_strideq+1]
+ movx m2, [srcq]
+ movx m1, [srcq+1]
+ movx m4, [srcq+src_strideq]
+ movx m3, [srcq+src_strideq+1]
%if cpuflag(ssse3)
punpcklbw m2, m1
punpcklbw m4, m3
pmaddubsw m2, filter_x_a
pmaddubsw m4, filter_x_a
- movh m1, [dstq]
- movh m3, [dstq+dst_strideq]
+ movx m1, [dstq]
+ movx m3, [dstq+dst_strideq]
paddw m2, filter_rnd
paddw m4, filter_rnd
%else
@@ -1079,9 +1148,9 @@
pmullw m3, filter_x_b
paddw m4, filter_rnd
paddw m2, m1
- movh m1, [dstq]
+ movx m1, [dstq]
paddw m4, m3
- movh m3, [dstq+dst_strideq]
+ movx m3, [dstq+dst_strideq]
%endif
psraw m2, 4
psraw m4, 4
@@ -1089,10 +1158,20 @@
pavgw m2, m4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline - also consider going to bytes here
+%if %1 == 4
+ movlhps m0, m2
+%endif
packuswb m0, m2
+%if %1 > 4
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
%endif
punpcklbw m3, m5
punpcklbw m1, m5
@@ -1110,7 +1189,7 @@
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
- STORE_AND_RET
+ STORE_AND_RET %1
.x_nonhalf_y_nonhalf:
%ifdef PIC
@@ -1118,7 +1197,7 @@
%endif
shl x_offsetd, filter_idx_shift
shl y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
+%if ARCH_X86_64 && %1 > 4
mova m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+x_offsetq+16]
@@ -1261,8 +1340,8 @@
INC_SRC_BY_SRC_STRIDE
add dstq, dst_strideq
%else ; %1 < 16
- movh m0, [srcq]
- movh m1, [srcq+1]
+ movx m0, [srcq]
+ movx m1, [srcq+1]
%if cpuflag(ssse3)
punpcklbw m0, m1
pmaddubsw m0, filter_x_a
@@ -1283,20 +1362,20 @@
INC_SRC_BY_SRC_STRIDE
.x_other_y_other_loop:
- movh m2, [srcq]
- movh m1, [srcq+1]
+ movx m2, [srcq]
+ movx m1, [srcq+1]
INC_SRC_BY_SRC_STRIDE
- movh m4, [srcq]
- movh m3, [srcq+1]
+ movx m4, [srcq]
+ movx m3, [srcq+1]
%if cpuflag(ssse3)
punpcklbw m2, m1
punpcklbw m4, m3
pmaddubsw m2, filter_x_a
pmaddubsw m4, filter_x_a
- movh m3, [dstq+dst_strideq]
- movh m1, [dstq]
+ movx m3, [dstq+dst_strideq]
+ movx m1, [dstq]
paddw m2, filter_rnd
paddw m4, filter_rnd
psraw m2, 4
@@ -1335,9 +1414,9 @@
pmullw m1, m4, filter_y_b
paddw m2, filter_rnd
paddw m0, m3
- movh m3, [dstq+dst_strideq]
+ movx m3, [dstq+dst_strideq]
paddw m2, m1
- movh m1, [dstq]
+ movx m1, [dstq]
psraw m0, 4
psraw m2, 4
punpcklbw m3, m5
@@ -1345,10 +1424,20 @@
%endif
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
+%if %1 == 4
+ movlhps m0, m2
+%endif
packuswb m0, m2
+%if %1 > 4
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
@@ -1366,7 +1455,8 @@
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
- STORE_AND_RET
+%undef movx
+ STORE_AND_RET %1
%endmacro
; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
@@ -1375,26 +1465,22 @@
; location in the sse/2 version, rather than duplicating that code in the
; binary.
-INIT_MMX sse
-SUBPEL_VARIANCE 4
INIT_XMM sse2
+SUBPEL_VARIANCE 4
SUBPEL_VARIANCE 8
SUBPEL_VARIANCE 16
-INIT_MMX ssse3
-SUBPEL_VARIANCE 4
INIT_XMM ssse3
+SUBPEL_VARIANCE 4
SUBPEL_VARIANCE 8
SUBPEL_VARIANCE 16
-INIT_MMX sse
-SUBPEL_VARIANCE 4, 1
INIT_XMM sse2
+SUBPEL_VARIANCE 4, 1
SUBPEL_VARIANCE 8, 1
SUBPEL_VARIANCE 16, 1
-INIT_MMX ssse3
-SUBPEL_VARIANCE 4, 1
INIT_XMM ssse3
+SUBPEL_VARIANCE 4, 1
SUBPEL_VARIANCE 8, 1
SUBPEL_VARIANCE 16, 1
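
The new movx macro, defined near the top of SUBPEL_VARIANCE, is what lets the 4-wide variants join the SSE2 code path that the 8- and 16-wide ones already used: it expands to movd (a 4-byte load) when %1 == 4 and to movh (an 8-byte load) otherwise. Roughly the same dispatch expressed in C intrinsics; the helper is ours, for illustration only:

    #include <emmintrin.h>
    #include <stdint.h>
    #include <string.h>

    /* Load one row of pixels into the low lanes of an xmm register:
     * 4 bytes (movd) for 4-wide blocks, 8 bytes (movq/movh) otherwise. */
    static __m128i load_row(const uint8_t *p, int width) {
      if (width == 4) {
        int32_t row;
        memcpy(&row, p, sizeof(row)); /* alignment-safe 4-byte load */
        return _mm_cvtsi32_si128(row);
      }
      return _mm_loadl_epi64((const __m128i *)p);
    }
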
diff --git a/vpx_dsp/x86/variance_sse2.c b/vpx_dsp/x86/variance_sse2.c
index 43f4603..6987c2e 100644
--- a/vpx_dsp/x86/variance_sse2.c
+++ b/vpx_dsp/x86/variance_sse2.c
@@ -320,11 +320,11 @@
int height, unsigned int *sse, \
void *unused0, void *unused)
#define DECLS(opt1, opt2) \
- DECL(4, opt2); \
+ DECL(4, opt1); \
DECL(8, opt1); \
DECL(16, opt1)
-DECLS(sse2, sse);
+DECLS(sse2, sse2);
DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL
@@ -380,10 +380,10 @@
FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t)); \
FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t)); \
FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t)); \
-FN(4, 8, 4, 2, 3, opt2, (int32_t), (int32_t)); \
-FN(4, 4, 4, 2, 2, opt2, (int32_t), (int32_t))
+FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t)); \
+FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t))
-FNS(sse2, sse);
+FNS(sse2, sse2);
FNS(ssse3, ssse3);
#undef FNS
@@ -401,11 +401,11 @@
int height, unsigned int *sse, \
void *unused0, void *unused)
#define DECLS(opt1, opt2) \
-DECL(4, opt2); \
+DECL(4, opt1); \
DECL(8, opt1); \
DECL(16, opt1)
-DECLS(sse2, sse);
+DECLS(sse2, sse2);
DECLS(ssse3, ssse3);
#undef DECL
#undef DECLS
@@ -466,8 +466,8 @@
FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t)); \
FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t)); \
FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t)); \
-FN(4, 8, 4, 2, 3, opt2, (uint32_t), (int32_t)); \
-FN(4, 4, 4, 2, 2, opt2, (uint32_t), (int32_t))
+FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t)); \
+FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t))
FNS(sse2, sse);
FNS(ssse3, ssse3);
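
The two-suffix DECLS/FNS pattern exists because, before this change, the 4-wide kernels were built from a different instruction-set file (opt2 = sse) than the 8- and 16-wide ones (opt1 = sse2); with every width now on SSE2 the second argument becomes redundant. The token-pasting mechanism, reduced to an illustrative sketch rather than the library's actual macros:

    #include <stdio.h>

    /* Paste a block width and an ISA suffix into a function name, the way
     * the DECL/FN macros build the vpx_sub_pixel_* symbol names. */
    #define MAKE_KERNEL(w, opt)               \
      static void kernel_##w##x_##opt(void) { \
        printf("kernel_%dx_%s\n", w, #opt);   \
      }

    MAKE_KERNEL(4, sse2)
    MAKE_KERNEL(8, sse2)

    int main(void) {
      kernel_4x_sse2();
      kernel_8x_sse2();
      return 0;
    }
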