SMOOTH_PRED: Use 12-bit multiplications instead of 18-bit.

Compression performance is roughly neutral:

AWCY:
-----
                 High Latency     Low Latency
  All Keyframes  0.00             0.00
  Video overall  0.01            -0.01

Google sets:
------------

- All Keyframes:

  lowres  -0.001
  midres   0.000
  hdres    0.001

- Video overall:

  lowres   0.019
  midres   0.000
  hdres   -0.013

Change-Id: I89be2739203bf3e2848e4ba7ae2988c625f54513
diff --git a/aom_dsp/intrapred.c b/aom_dsp/intrapred.c
index 50b6a08..447d72a 100644
--- a/aom_dsp/intrapred.c
+++ b/aom_dsp/intrapred.c
@@ -260,13 +260,7 @@
   }
 }
 
-// Weights are quadratic from 'bs' to '1', scaled by 2^12.
-// TODO(urvang): All weights can be at the same scale: going from '1' to '1/bs'
-// instead (still scaled by 2^12 or more).
-// Rationale: Given that max block dimension is 64 (=2^6), and max pixel value
-// is below 2^12 (for both normal and highbitdepth), power of (31 - 6 - 12 - 1)
-// = 12 is chosen so that all weighted sums in smooth_predictor() remain within
-// 2^31 (unsigned integer) range.
+// Weights are quadratic from '1' to '1 / block_size', scaled by 2^12.
 static const int sm_weight_log2_scale = 12;
 
 #if CONFIG_TX64X64
@@ -275,29 +269,26 @@
 static const uint32_t sm_weight_arrays[5][32] = {
 #endif  // CONFIG_TX64X64
   // bs = 2
-  { 8192, 4096 },
+  { 4096, 2048 },
   // bs = 4
-  { 16384, 9557, 5461, 4096 },
+  { 4096, 2389, 1365, 1024 },
   // bs = 8
-  { 32768, 25161, 18725, 13458, 9362, 6437, 4681, 4096 },
+  { 4096, 3145, 2341, 1682, 1170, 805, 585, 512 },
   // bs = 16
-  { 65536, 57617, 50244, 43418, 37137, 31403, 26214, 21572, 17476, 13926, 10923,
-    8465, 6554, 5188, 4369, 4096 },
+  { 4096, 3601, 3140, 2714, 2321, 1963, 1638, 1348, 1092, 870, 683, 529, 410,
+    324, 273, 256 },
   // bs = 32
-  { 131072, 123012, 115217, 107685, 100418, 93415, 86677, 80202,
-    73992,  68046,  62365,  56948,  51795,  46906, 42281, 37921,
-    33825,  29993,  26426,  23123,  20084,  17309, 14798, 12552,
-    10570,  8853,   7399,   6210,   5285,   4625,  4228,  4096 },
+  { 4096, 3844, 3601, 3365, 3138, 2919, 2709, 2506, 2312, 2126, 1949,
+    1780, 1619, 1466, 1321, 1185, 1057, 937,  826,  723,  628,  541,
+    462,  392,  330,  277,  231,  194,  165,  145,  132,  128 },
 #if CONFIG_TX64X64
   // bs = 64
-  { 262144, 254017, 246020, 238153, 230416, 222809, 215333, 207986,
-    200769, 193682, 186726, 179899, 173202, 166636, 160199, 153893,
-    147716, 141670, 135753, 129967, 124310, 118784, 113388, 108121,
-    102985, 97979,  93103,  88357,  83740,  79254,  74898,  70672,
-    66576,  62610,  58774,  55068,  51493,  48047,  44731,  41545,
-    38489,  35564,  32768,  30102,  27567,  25161,  22886,  20740,
-    18725,  16839,  15084,  13458,  11963,  10598,  9362,   8257,
-    7282,   6437,   5721,   5136,   4681,   4356,   4161,   4096 },
+  { 4096, 3969, 3844, 3721, 3600, 3481, 3365, 3250, 3137, 3026, 2918,
+    2811, 2706, 2604, 2503, 2405, 2308, 2214, 2121, 2031, 1942, 1856,
+    1772, 1689, 1609, 1531, 1455, 1381, 1308, 1238, 1170, 1104, 1040,
+    978,  918,  860,  805,  751,  699,  649,  601,  556,  512,  470,
+    431,  393,  358,  324,  293,  263,  236,  210,  187,  166,  146,
+    129,  114,  101,  89,   80,   73,   68,   65,   64 },
 #endif  // CONFIG_TX64X64
 };
 
@@ -307,24 +298,23 @@
                                     const uint8_t *above, const uint8_t *left) {
   const uint8_t below_pred = left[bs - 1];   // estimated by bottom-left pixel
   const uint8_t right_pred = above[bs - 1];  // estimated by top-right pixel
-  const int log2_bs = (int)lround(log2(bs));
-  const int arr_index = log2_bs - 1;
+  const int arr_index = (int)lround(log2(bs)) - 1;
   const uint32_t *const sm_weights = sm_weight_arrays[arr_index];
-  // scale = 2 * bs * 2^sm_weight_log2_scale
-  const int log2_scale = 1 + log2_bs + sm_weight_log2_scale;
+  // scale = 2 * 2^sm_weight_log2_scale
+  const int log2_scale = 1 + sm_weight_log2_scale;
   assert(log2_scale + 8 < 8 * 31);  // sanity check: no overflow.
-  const uint32_t scaled_bs = sm_weights[0];
-  assert((int)scaled_bs == (bs << sm_weight_log2_scale));
+  const uint32_t scale = sm_weights[0];
+  assert((int)scale == (1 << sm_weight_log2_scale));
   int r;
   for (r = 0; r < bs; ++r) {
     int c;
     for (c = 0; c < bs; ++c) {
       const uint8_t pixels[] = { above[c], below_pred, left[r], right_pred };
-      const uint32_t weights[] = { sm_weights[r], scaled_bs - sm_weights[r],
-                                   sm_weights[c], scaled_bs - sm_weights[c] };
+      const uint32_t weights[] = { sm_weights[r], scale - sm_weights[r],
+                                   sm_weights[c], scale - sm_weights[c] };
       uint32_t this_pred = 0;
       int i;
-      assert(scaled_bs >= sm_weights[r] && scaled_bs >= sm_weights[c]);
+      assert(scale >= sm_weights[r] && scale >= sm_weights[c]);
       for (i = 0; i < 4; ++i) {
         this_pred += weights[i] * pixels[i];
       }
@@ -1042,24 +1032,23 @@
                                            const uint16_t *left, int bd) {
   const uint16_t below_pred = left[bs - 1];   // estimated by bottom-left pixel
   const uint16_t right_pred = above[bs - 1];  // estimated by top-right pixel
-  const int log2_bs = (int)lround(log2(bs));
-  const int arr_index = log2_bs - 1;
+  const int arr_index = (int)lround(log2(bs)) - 1;
   const uint32_t *const sm_weights = sm_weight_arrays[arr_index];
-  // scale = 2 * bs * 2^sm_weight_log2_scale
-  const int log2_scale = 1 + log2_bs + sm_weight_log2_scale;
+  // scale = 2 * 2^sm_weight_log2_scale
+  const int log2_scale = 1 + sm_weight_log2_scale;
   assert(log2_scale + 8 < 8 * 31);  // sanity check: no overflow.
-  const uint32_t scaled_bs = sm_weights[0];
-  assert((int)scaled_bs == (bs << sm_weight_log2_scale));
+  const uint32_t scale = sm_weights[0];
+  assert((int)scale == (1 << sm_weight_log2_scale));
   int r;
   for (r = 0; r < bs; ++r) {
     int c;
     for (c = 0; c < bs; ++c) {
       const uint16_t pixels[] = { above[c], below_pred, left[r], right_pred };
-      const uint32_t weights[] = { sm_weights[r], scaled_bs - sm_weights[r],
-                                   sm_weights[c], scaled_bs - sm_weights[c] };
+      const uint32_t weights[] = { sm_weights[r], scale - sm_weights[r],
+                                   sm_weights[c], scale - sm_weights[c] };
       uint32_t this_pred = 0;
       int i;
-      assert(scaled_bs >= sm_weights[r] && scaled_bs >= sm_weights[c]);
+      assert(scale >= sm_weights[r] && scale >= sm_weights[c]);
       for (i = 0; i < 4; ++i) {
         this_pred += weights[i] * pixels[i];
       }
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 5c8c554..5046bc9 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -179,7 +179,7 @@
     "f162b51ed618d28b936974cff4391da5",
 #if CONFIG_ALT_INTRA
     "297e8fbb5d33c29b12b228fa9d7c40a4",
-    "7177dd1ae3b49441f997d439a5bd451a"
+    "a08d5b7e104c5fc2b203789ee5f725a7"
 #else
     "9e1370c6d42e08d357d9612c93a71cfc",
 #endif  // CONFIG_ALT_INTRA