Properly scale diff in temporal filter

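Scale the sum of squared pixel differences in temporal filtering
according to the input bit depth. A squared difference grows by
2 * (bd - 8) bits relative to 8-bit input, so shift by that amount
instead of (bd - 8) * (bd - 8). The two shifts coincide for 10-bit
input (both are 4) but differ for 12-bit input (8 vs. 16), so this
resolves a compression performance regression in the 12-bit setting.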

BUG=aomedia:2780

STATS_CHANGED

Change-Id: Id858228458a1198b558596e3c77ef4fe18d1e461
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index d98439c..f7abc4d 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -582,7 +582,7 @@
         }
 
         // Scale down the difference for high bit depth input.
-        if (mbd->bd > 8) sum_square_diff >>= (mbd->bd - 8) * (mbd->bd - 8);
+        if (mbd->bd > 8) sum_square_diff >>= ((mbd->bd - 8) * 2);
 
         // Combine window error and block error, and normalize it.
         const double window_error = (double)sum_square_diff / num_ref_pixels;
diff --git a/av1/encoder/x86/highbd_temporal_filter_sse2.c b/av1/encoder/x86/highbd_temporal_filter_sse2.c
index 8e6e43d..b7f8533 100644
--- a/av1/encoder/x86/highbd_temporal_filter_sse2.c
+++ b/av1/encoder/x86/highbd_temporal_filter_sse2.c
@@ -214,7 +214,7 @@
       }
 
       // Scale down the difference for high bit depth input.
-      diff_sse >>= (bd - 8) * (bd - 8);
+      diff_sse >>= ((bd - 8) * 2);
 
       const double window_error = (double)(diff_sse) / num_ref_pixels;
       const int subblock_idx =