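Save 3 instructions per row in od_filter_dering_direction_8x8()

Mask each pair of symmetric taps first, then add the pair and apply the
tap weight once, instead of weighting the two taps separately. Also load
the row being filtered with v128_load_aligned() instead of
v128_load_unaligned().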

Change-Id: I7ca73e03ed171b57a05dd1fd5957906e11b66728
diff --git a/av1/common/od_dering_simd.h b/av1/common/od_dering_simd.h
index 4f5406e..3435d7c 100644
--- a/av1/common/od_dering_simd.h
+++ b/av1/common/od_dering_simd.h
@@ -288,7 +288,7 @@
                                               int dir) {
   int i;
   v128 sum;
-  v128 p;
+  v128 p0, p1;
   v128 cmp;
   v128 row;
   v128 res;
@@ -302,53 +302,53 @@
   thresh = v128_dup_16(threshold);
   for (i = 0; i < 8; i++) {
     sum = v128_zero();
-    row = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE]);
+    row = v128_load_aligned(&in[i * OD_FILT_BSTRIDE]);
 
-    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
-    p = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + off1]), row);
-    /*if (abs(p) < thresh) sum += taps[k]*p*/
-    cmp = od_cmplt_abs_epi16(p, thresh);
-    p = v128_add_16(p, v128_shl_n_16(p, 1));
-    p = v128_and(p, cmp);
-    sum = v128_add_16(sum, p);
+    /*p0 = in[i*OD_FILT_BSTRIDE + offset] - row*/
+    p0 = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + off1]), row);
+    /*p0 = abs(p0) < thresh ? p0 : 0*/
+    cmp = od_cmplt_abs_epi16(p0, thresh);
+    p0 = v128_and(p0, cmp);
 
-    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
-    p = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - off1]), row);
-    /*if (abs(p) < thresh) sum += taps[k]*p1*/
-    cmp = od_cmplt_abs_epi16(p, thresh);
-    p = v128_add_16(p, v128_shl_n_16(p, 1));
-    p = v128_and(p, cmp);
-    sum = v128_add_16(sum, p);
+    /*p1 = in[i*OD_FILT_BSTRIDE - offset] - row*/
+    p1 = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - off1]), row);
+    /*p1 = abs(p1) < thresh ? p1 : 0*/
+    cmp = od_cmplt_abs_epi16(p1, thresh);
+    p1 = v128_and(p1, cmp);
+    /*sum += 3*(p0 + p1)*/
+    p0 = v128_add_16(p0, p1);
+    p0 = v128_add_16(p0, v128_shl_n_16(p0, 1));
+    sum = v128_add_16(sum, p0);
 
-    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
-    p = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + off2]), row);
-    /*if (abs(p) < thresh) sum += taps[k]*p*/
-    cmp = od_cmplt_abs_epi16(p, thresh);
-    p = v128_shl_n_16(p, 1);
-    p = v128_and(p, cmp);
-    sum = v128_add_16(sum, p);
+    /*p0 = in[i*OD_FILT_BSTRIDE + offset] - row*/
+    p0 = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + off2]), row);
+    /*p0 = abs(p0) < thresh ? p0 : 0*/
+    cmp = od_cmplt_abs_epi16(p0, thresh);
+    p0 = v128_and(p0, cmp);
 
-    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
-    p = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - off2]), row);
-    /*if (abs(p) < thresh) sum += taps[k]*p1*/
-    cmp = od_cmplt_abs_epi16(p, thresh);
-    p = v128_shl_n_16(p, 1);
-    p = v128_and(p, cmp);
-    sum = v128_add_16(sum, p);
+    /*p1 = in[i*OD_FILT_BSTRIDE - offset] - row*/
+    p1 = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - off2]), row);
+    /*p1 = abs(p1) < thresh ? p1 : 0*/
+    cmp = od_cmplt_abs_epi16(p1, thresh);
+    p1 = v128_and(p1, cmp);
+    /* sum += 2*(p0 + p1)*/
+    p0 = v128_shl_n_16(v128_add_16(p0, p1), 1);
+    sum = v128_add_16(sum, p0);
 
-    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
-    p = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + off3]), row);
-    /*if (abs(p) < thresh) sum += taps[k]*p*/
-    cmp = od_cmplt_abs_epi16(p, thresh);
-    p = v128_and(p, cmp);
-    sum = v128_add_16(sum, p);
+    /*p0 = in[i*OD_FILT_BSTRIDE + offset] - row*/
+    p0 = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + off3]), row);
+    /*p0 = abs(p0) < thresh ? p0 : 0*/
+    cmp = od_cmplt_abs_epi16(p0, thresh);
+    p0 = v128_and(p0, cmp);
 
-    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
-    p = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - off3]), row);
-    /*if (abs(p) < thresh) sum += taps[k]*p1*/
-    cmp = od_cmplt_abs_epi16(p, thresh);
-    p = v128_and(p, cmp);
-    sum = v128_add_16(sum, p);
+    /*p1 = in[i*OD_FILT_BSTRIDE - offset] - row*/
+    p1 = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - off3]), row);
+    /*p1 = abs(p1) < thresh ? p1 : 0*/
+    cmp = od_cmplt_abs_epi16(p1, thresh);
+    p1 = v128_and(p1, cmp);
+    /*sum += (p0 + p1)*/
+    p0 = v128_add_16(p0, p1);
+    sum = v128_add_16(sum, p0);
 
     /*res = row + ((sum + 8) >> 4)*/
     res = v128_add_16(sum, v128_dup_16(8));