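Fix av1_inv_txfm2d_add_NxM_neon for 32-bit Arm

In the non-AArch64 fallback, u0..u3 were built from the low half of
val[0] and the high half of val[1]. The AArch64 path uses vzip1q_s64,
which takes the low 64-bit lane of both inputs, so the two paths
disagreed. Use vget_low_s32 for both halves in the fallback.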

BUG=aomedia:2698

Change-Id: I24731268a629ebe1c9e63632ce23d0b1ad78260d
diff --git a/av1/common/arm/highbd_inv_txfm_neon.c b/av1/common/arm/highbd_inv_txfm_neon.c
index 7234459..eaa5e1e 100644
--- a/av1/common/arm/highbd_inv_txfm_neon.c
+++ b/av1/common/arm/highbd_inv_txfm_neon.c
@@ -629,7 +629,7 @@
   u0 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u0x.val[0]),
                                         vreinterpretq_s64_s32(u0x.val[1])));
 #else
-  u0 = vcombine_s32(vget_low_s32(u0x.val[0]), vget_high_s32(u0x.val[1]));
+  u0 = vcombine_s32(vget_low_s32(u0x.val[0]), vget_low_s32(u0x.val[1]));
 #endif  // (__aarch64__)
   // u1
   int32x4x2_t u1x;
@@ -654,7 +654,7 @@
   u1 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u1x.val[0]),
                                         vreinterpretq_s64_s32(u1x.val[1])));
 #else
-  u1 = vcombine_s32(vget_low_s32(u1x.val[0]), vget_high_s32(u1x.val[1]));
+  u1 = vcombine_s32(vget_low_s32(u1x.val[0]), vget_low_s32(u1x.val[1]));
 #endif  // (__aarch64__)
 
   // u2
@@ -680,7 +680,7 @@
   u2 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u2x.val[0]),
                                         vreinterpretq_s64_s32(u2x.val[1])));
 #else
-  u2 = vcombine_s32(vget_low_s32(u2x.val[0]), vget_high_s32(u2x.val[1]));
+  u2 = vcombine_s32(vget_low_s32(u2x.val[0]), vget_low_s32(u2x.val[1]));
 #endif  // (__aarch64__)
 
   // u3
@@ -706,7 +706,7 @@
   u3 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u3x.val[0]),
                                         vreinterpretq_s64_s32(u3x.val[1])));
 #else
-  u3 = vcombine_s32(vget_low_s32(u3x.val[0]), vget_high_s32(u3x.val[1]));
+  u3 = vcombine_s32(vget_low_s32(u3x.val[0]), vget_low_s32(u3x.val[1]));
 #endif  // (__aarch64__)
 
   out[0] = u0;
@@ -2771,7 +2771,7 @@
         vmull_s32(vmovn_s64(vreinterpretq_s64_s32(in[i])), fact));
     a0.val[0] = vreinterpretq_s32_s64(
         vrshrq_n_s64(vreinterpretq_s64_s32(a0.val[0]), NewSqrt2Bits));
-    a0.val[1] = vextq_s32(in[i], zero, 1);  // 4
+    a0.val[1] = vextq_s32(in[i], zero, 1);
     a0.val[1] = vreinterpretq_s32_s64(
         vmull_s32(vmovn_s64(vreinterpretq_s64_s32(a0.val[1])), fact));
     a0.val[1] = vreinterpretq_s32_s64(