aom_comp_avg_pred_neon: fix assertion failure

Use an unaligned load for the 4x4 case, since the reference stride is not
guaranteed to be a multiple of sizeof(uint32_t).

Fixes failures in the SAD tests:
SADx4AvgTest.UnalignedRef/21
test_libaom: aom_dsp/arm/mem_neon.h:102: load_u8: Assertion
  `!(stride % sizeof(uint32_t))' failed.

Change-Id: Ibead1763fd73f359cf0f26a382697822e42d926d
diff --git a/aom_dsp/arm/avg_pred_neon.c b/aom_dsp/arm/avg_pred_neon.c
index a6972b7..9262427 100644
--- a/aom_dsp/arm/avg_pred_neon.c
+++ b/aom_dsp/arm/avg_pred_neon.c
@@ -61,7 +61,7 @@
 
     do {
       const uint8x16_t p = vld1q_u8(pred);
-      const uint8x16_t r = load_u8_4x4(ref, ref_stride);
+      const uint8x16_t r = load_unaligned_u8q(ref, ref_stride);
       const uint8x16_t avg = vrhaddq_u8(p, r);
 
       vst1q_u8(comp_pred, avg);
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index d305b4f..3410fb8 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -107,10 +107,6 @@
   return vreinterpret_u8_u32(a);
 }
 
-static INLINE uint8x16_t load_u8_4x4(const uint8_t *s, ptrdiff_t p) {
-  return vcombine_u8(load_u8(s, p), load_u8(s + 2 * p, p));
-}
-
 static INLINE void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
                                uint8x8_t *const s0, uint8x8_t *const s1,
                                uint8x8_t *const s2, uint8x8_t *const s3,