Reduce tx_size feature calculation
Rewrote the feature calculation for tx_size split prediction, and
reduced the complexity by eliminating 1 level mean/sse calculation.
get_blk_sse_sum_c can be written to SIMD to provide further speedup.
This CL didn't introduce bitstream change(borg test verified that).
Average speed 1 speedup over midres set is 0.2%(instruction count
result).
Change-Id: I188cd5dfe6d1d1de07ba60394499bc938c38e4b6
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index bb5ee96..cc73126 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -4944,90 +4944,71 @@
}
}
-static void get_mean_and_dev(const int16_t *data, int stride, int bw, int bh,
- float *mean, float *dev) {
- int x_sum = 0;
- uint64_t x2_sum = 0;
+static void get_blk_sse_sum_c(const int16_t *data, int stride, int bw, int bh,
+ int *x_sum, int *x2_sum) {
+ *x_sum = 0;
+ *x2_sum = 0;
for (int i = 0; i < bh; ++i) {
for (int j = 0; j < bw; ++j) {
const int val = data[j];
- x_sum += val;
- x2_sum += val * val;
+ *x_sum += val;
+ *x2_sum += val * val;
}
data += stride;
}
-
- const int num = bw * bh;
- const float e_x = (float)x_sum / num;
- const float e_x2 = (float)((double)x2_sum / num);
- const float diff = e_x2 - e_x * e_x;
- *dev = (diff > 0) ? sqrtf(diff) : 0;
- *mean = e_x;
}
-static void get_mean_and_dev_float(const float *data, int stride, int bw,
- int bh, float *mean, float *dev) {
- float x_sum = 0;
- float x2_sum = 0;
- for (int i = 0; i < bh; ++i) {
- for (int j = 0; j < bw; ++j) {
- const float val = data[j];
- x_sum += val;
- x2_sum += val * val;
- }
- data += stride;
- }
-
- const int num = bw * bh;
- const float e_x = x_sum / num;
- const float e_x2 = x2_sum / num;
- const float diff = e_x2 - e_x * e_x;
- *dev = (diff > 0) ? sqrtf(diff) : 0;
- *mean = e_x;
+static float get_dev(float mean, float x2_sum, int num) {
+ const float e_x2 = (float)((double)x2_sum / num);
+ const float diff = e_x2 - mean * mean;
+ const float dev = (diff > 0) ? sqrtf(diff) : 0;
+ return dev;
}
// Feature used by the model to predict tx split: the mean and standard
// deviation values of the block and sub-blocks.
static void get_mean_dev_features(const int16_t *data, int stride, int bw,
- int bh, int levels, float *feature) {
- int feature_idx = 0;
- int width = bw;
- int height = bh;
+ int bh, float *feature) {
const int16_t *const data_ptr = &data[0];
- for (int lv = 0; lv < levels; ++lv) {
- if (width < 2 || height < 2) break;
- float mean_buf[16];
- float dev_buf[16];
- int blk_idx = 0;
- for (int row = 0; row < bh; row += height) {
- for (int col = 0; col < bw; col += width) {
- float mean, dev;
- get_mean_and_dev(data_ptr + row * stride + col, stride, width, height,
- &mean, &dev);
- feature[feature_idx++] = mean;
- feature[feature_idx++] = dev;
- mean_buf[blk_idx] = mean;
- dev_buf[blk_idx++] = dev;
- }
- }
- if (blk_idx > 1) {
- float mean, dev;
- // Deviation of means.
- get_mean_and_dev_float(mean_buf, 1, 1, blk_idx, &mean, &dev);
- feature[feature_idx++] = dev;
- // Mean of deviations.
- get_mean_and_dev_float(dev_buf, 1, 1, blk_idx, &mean, &dev);
+ const int subh = (bh >= bw) ? (bh >> 1) : bh;
+ const int subw = (bw >= bh) ? (bw >> 1) : bw;
+ const int num = bw * bh;
+ const int sub_num = subw * subh;
+ int feature_idx = 2;
+ int total_x_sum = 0;
+ int total_x2_sum = 0;
+ int blk_idx = 0;
+ float mean2_sum = 0.0f;
+ float dev_sum = 0.0f;
+
+ for (int row = 0; row < bh; row += subh) {
+ for (int col = 0; col < bw; col += subw) {
+ int x_sum, x2_sum;
+ // TODO(any): Write a SIMD version. Clear registers.
+ get_blk_sse_sum_c(data_ptr + row * stride + col, stride, subw, subh,
+ &x_sum, &x2_sum);
+ total_x_sum += x_sum;
+ total_x2_sum += x2_sum;
+
+ const float mean = (float)x_sum / sub_num;
+ const float dev = get_dev(mean, (float)x2_sum, sub_num);
feature[feature_idx++] = mean;
+ feature[feature_idx++] = dev;
+ mean2_sum += mean * mean;
+ dev_sum += dev;
+ blk_idx++;
}
- // Reduce the block size when proceeding to the next level.
- if (height == width) {
- height = height >> 1;
- width = width >> 1;
- } else if (height > width) {
- height = height >> 1;
- } else {
- width = width >> 1;
- }
+ }
+
+ const float lvl0_mean = (float)total_x_sum / num;
+ feature[0] = lvl0_mean;
+ feature[1] = get_dev(lvl0_mean, (float)total_x2_sum, num);
+
+ if (blk_idx > 1) {
+ // Deviation of means.
+ feature[feature_idx++] = get_dev(lvl0_mean, mean2_sum, blk_idx);
+ // Mean of deviations.
+ feature[feature_idx++] = dev_sum / blk_idx;
}
}
@@ -5044,11 +5025,12 @@
aom_clear_system_state();
float features[64] = { 0.0f };
- get_mean_dev_features(diff, diff_stride, bw, bh, 2, features);
+ get_mean_dev_features(diff, diff_stride, bw, bh, features);
float score = 0.0f;
av1_nn_predict(features, nn_config, 1, &score);
aom_clear_system_state();
+
if (score > 8.0f) return 100;
if (score < -8.0f) return 0;
score = 1.0f / (1.0f + (float)exp(-score));