diff --git a/aom_dsp/x86/avg_intrin_sse2.c b/aom_dsp/x86/avg_intrin_sse2.c
index 3057a71..1c5f6eb 100644
--- a/aom_dsp/x86/avg_intrin_sse2.c
+++ b/aom_dsp/x86/avg_intrin_sse2.c
@@ -409,7 +409,7 @@
 
 void aom_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref,
                           const int ref_stride, const int height) {
-  int idx;
+  int idx = 1;
   __m128i zero = _mm_setzero_si128();
   __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
   __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
@@ -417,8 +417,7 @@
   __m128i t0, t1;
   int height_1 = height - 1;
   ref += ref_stride;
-
-  for (idx = 1; idx < height_1; idx += 2) {
+  do {
     src_line = _mm_loadu_si128((const __m128i *)ref);
     t0 = _mm_unpacklo_epi8(src_line, zero);
     t1 = _mm_unpackhi_epi8(src_line, zero);
@@ -432,7 +431,8 @@
     s0 = _mm_adds_epu16(s0, t0);
     s1 = _mm_adds_epu16(s1, t1);
     ref += ref_stride;
-  }
+    idx += 2;
+  } while (idx < height_1);
 
   src_line = _mm_loadu_si128((const __m128i *)ref);
   t0 = _mm_unpacklo_epi8(src_line, zero);
@@ -449,6 +449,7 @@
     s0 = _mm_srai_epi16(s0, 4);
     s1 = _mm_srai_epi16(s1, 4);
   } else {
+    assert(height == 16);
     s0 = _mm_srai_epi16(s0, 3);
     s1 = _mm_srai_epi16(s1, 3);
   }
@@ -460,14 +461,14 @@
 
 int16_t aom_int_pro_col_sse2(const uint8_t *ref, const int width) {
   __m128i zero = _mm_setzero_si128();
-  __m128i src_line = _mm_load_si128((const __m128i *)ref);
+  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
   __m128i s0 = _mm_sad_epu8(src_line, zero);
   __m128i s1;
   int i;
 
   for (i = 16; i < width; i += 16) {
     ref += 16;
-    src_line = _mm_load_si128((const __m128i *)ref);
+    src_line = _mm_loadu_si128((const __m128i *)ref);
     s1 = _mm_sad_epu8(src_line, zero);
     s0 = _mm_adds_epu16(s0, s1);
   }
diff --git a/test/avg_test.cc b/test/avg_test.cc
index 0dc06bd..cbd3461 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -42,7 +42,7 @@
   static const int kDataBlockSize = 64 * 128;
 
   virtual void SetUp() {
-    source_data_ = reinterpret_cast<Pixel *>(
+    source_data_ = static_cast<Pixel *>(
         aom_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
     ASSERT_TRUE(source_data_ != NULL);
     source_stride_ = (width_ + 31) & ~31;
@@ -138,6 +138,7 @@
 typedef void (*IntProRowFunc)(int16_t hbuf[16], uint8_t const *ref,
                               const int ref_stride, const int height);
 
+// Params: height, asm function, C function.
 typedef std::tuple<int, IntProRowFunc, IntProRowFunc> IntProRowParam;
 
 class IntProRowTest : public AverageTestBase<uint8_t>,
@@ -151,13 +152,13 @@
 
  protected:
   virtual void SetUp() {
-    source_data_ = reinterpret_cast<uint8_t *>(
+    source_data_ = static_cast<uint8_t *>(
         aom_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
     ASSERT_TRUE(source_data_ != NULL);
 
-    hbuf_asm_ = reinterpret_cast<int16_t *>(
+    hbuf_asm_ = static_cast<int16_t *>(
         aom_memalign(kDataAlignment, sizeof(*hbuf_asm_) * 16));
-    hbuf_c_ = reinterpret_cast<int16_t *>(
+    hbuf_c_ = static_cast<int16_t *>(
         aom_memalign(kDataAlignment, sizeof(*hbuf_c_) * 16));
   }
 
@@ -186,6 +187,7 @@
 
 typedef int16_t (*IntProColFunc)(uint8_t const *ref, const int width);
 
+// Params: width, asm function, C function.
 typedef std::tuple<int, IntProColFunc, IntProColFunc> IntProColParam;
 
 class IntProColTest : public AverageTestBase<uint8_t>,
