Merge changes I2aa2a545,I63932eda,Ie3694ecd

* changes:
  ssim: Add missing statics and consts
  psnrhvs: Add missing consts and static consts.
  ssim: Replace unsigned long with uint32_t.
diff --git a/vpx_dsp/psnrhvs.c b/vpx_dsp/psnrhvs.c index 8aa30f2..4d723e4 100644 --- a/vpx_dsp/psnrhvs.c +++ b/vpx_dsp/psnrhvs.c
@@ -23,7 +23,8 @@ #endif #include <string.h> -void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, int xstride) { +static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, + int xstride) { (void) xstride; vpx_fdct8x8(x, y, ystride); } @@ -31,56 +32,57 @@ /* Normalized inverse quantization matrix for 8x8 DCT at the point of * transparency. This is not the JPEG based matrix from the paper, this one gives a slightly higher MOS agreement.*/ -float csf_y[8][8] = {{1.6193873005, 2.2901594831, 2.08509755623, 1.48366094411, - 1.00227514334, 0.678296995242, 0.466224900598, 0.3265091542}, {2.2901594831, - 1.94321815382, 2.04793073064, 1.68731108984, 1.2305666963, 0.868920337363, - 0.61280991668, 0.436405793551}, {2.08509755623, 2.04793073064, - 1.34329019223, 1.09205635862, 0.875748795257, 0.670882927016, - 0.501731932449, 0.372504254596}, {1.48366094411, 1.68731108984, - 1.09205635862, 0.772819797575, 0.605636379554, 0.48309405692, - 0.380429446972, 0.295774038565}, {1.00227514334, 1.2305666963, - 0.875748795257, 0.605636379554, 0.448996256676, 0.352889268808, - 0.283006984131, 0.226951348204}, {0.678296995242, 0.868920337363, - 0.670882927016, 0.48309405692, 0.352889268808, 0.27032073436, - 0.215017739696, 0.17408067321}, {0.466224900598, 0.61280991668, - 0.501731932449, 0.380429446972, 0.283006984131, 0.215017739696, - 0.168869545842, 0.136153931001}, {0.3265091542, 0.436405793551, - 0.372504254596, 0.295774038565, 0.226951348204, 0.17408067321, - 0.136153931001, 0.109083846276}}; -float csf_cb420[8][8] = { +static const float csf_y[8][8] = { + {1.6193873005, 2.2901594831, 2.08509755623, 1.48366094411, 1.00227514334, + 0.678296995242, 0.466224900598, 0.3265091542}, + {2.2901594831, 1.94321815382, 2.04793073064, 1.68731108984, 1.2305666963, + 0.868920337363, 0.61280991668, 0.436405793551}, + {2.08509755623, 2.04793073064, 1.34329019223, 1.09205635862, 0.875748795257, + 0.670882927016, 0.501731932449, 0.372504254596}, + {1.48366094411, 
1.68731108984, 1.09205635862, 0.772819797575, + 0.605636379554, 0.48309405692, 0.380429446972, 0.295774038565}, + {1.00227514334, 1.2305666963, 0.875748795257, 0.605636379554, + 0.448996256676, 0.352889268808, 0.283006984131, 0.226951348204}, + {0.678296995242, 0.868920337363, 0.670882927016, 0.48309405692, + 0.352889268808, 0.27032073436, 0.215017739696, 0.17408067321}, + {0.466224900598, 0.61280991668, 0.501731932449, 0.380429446972, + 0.283006984131, 0.215017739696, 0.168869545842, 0.136153931001}, + {0.3265091542, 0.436405793551, 0.372504254596, 0.295774038565, + 0.226951348204, 0.17408067321, 0.136153931001, 0.109083846276}}; +static const float csf_cb420[8][8] = { {1.91113096927, 2.46074210438, 1.18284184739, 1.14982565193, 1.05017074788, - 0.898018824055, 0.74725392039, 0.615105596242}, {2.46074210438, - 1.58529308355, 1.21363250036, 1.38190029285, 1.33100189972, - 1.17428548929, 0.996404342439, 0.830890433625}, {1.18284184739, - 1.21363250036, 0.978712413627, 1.02624506078, 1.03145147362, - 0.960060382087, 0.849823426169, 0.731221236837}, {1.14982565193, - 1.38190029285, 1.02624506078, 0.861317501629, 0.801821139099, - 0.751437590932, 0.685398513368, 0.608694761374}, {1.05017074788, - 1.33100189972, 1.03145147362, 0.801821139099, 0.676555426187, - 0.605503172737, 0.55002013668, 0.495804539034}, {0.898018824055, - 1.17428548929, 0.960060382087, 0.751437590932, 0.605503172737, - 0.514674450957, 0.454353482512, 0.407050308965}, {0.74725392039, - 0.996404342439, 0.849823426169, 0.685398513368, 0.55002013668, - 0.454353482512, 0.389234902883, 0.342353999733}, {0.615105596242, - 0.830890433625, 0.731221236837, 0.608694761374, 0.495804539034, - 0.407050308965, 0.342353999733, 0.295530605237}}; -float csf_cr420[8][8] = { + 0.898018824055, 0.74725392039, 0.615105596242}, + {2.46074210438, 1.58529308355, 1.21363250036, 1.38190029285, 1.33100189972, + 1.17428548929, 0.996404342439, 0.830890433625}, + {1.18284184739, 1.21363250036, 0.978712413627, 1.02624506078, 
1.03145147362, + 0.960060382087, 0.849823426169, 0.731221236837}, + {1.14982565193, 1.38190029285, 1.02624506078, 0.861317501629, + 0.801821139099, 0.751437590932, 0.685398513368, 0.608694761374}, + {1.05017074788, 1.33100189972, 1.03145147362, 0.801821139099, + 0.676555426187, 0.605503172737, 0.55002013668, 0.495804539034}, + {0.898018824055, 1.17428548929, 0.960060382087, 0.751437590932, + 0.605503172737, 0.514674450957, 0.454353482512, 0.407050308965}, + {0.74725392039, 0.996404342439, 0.849823426169, 0.685398513368, + 0.55002013668, 0.454353482512, 0.389234902883, 0.342353999733}, + {0.615105596242, 0.830890433625, 0.731221236837, 0.608694761374, + 0.495804539034, 0.407050308965, 0.342353999733, 0.295530605237}}; +static const float csf_cr420[8][8] = { {2.03871978502, 2.62502345193, 1.26180942886, 1.11019789803, 1.01397751469, - 0.867069376285, 0.721500455585, 0.593906509971}, {2.62502345193, - 1.69112867013, 1.17180569821, 1.3342742857, 1.28513006198, - 1.13381474809, 0.962064122248, 0.802254508198}, {1.26180942886, - 1.17180569821, 0.944981930573, 0.990876405848, 0.995903384143, - 0.926972725286, 0.820534991409, 0.706020324706}, {1.11019789803, - 1.3342742857, 0.990876405848, 0.831632933426, 0.77418706195, - 0.725539939514, 0.661776842059, 0.587716619023}, {1.01397751469, - 1.28513006198, 0.995903384143, 0.77418706195, 0.653238524286, - 0.584635025748, 0.531064164893, 0.478717061273}, {0.867069376285, - 1.13381474809, 0.926972725286, 0.725539939514, 0.584635025748, - 0.496936637883, 0.438694579826, 0.393021669543}, {0.721500455585, - 0.962064122248, 0.820534991409, 0.661776842059, 0.531064164893, - 0.438694579826, 0.375820256136, 0.330555063063}, {0.593906509971, - 0.802254508198, 0.706020324706, 0.587716619023, 0.478717061273, - 0.393021669543, 0.330555063063, 0.285345396658}}; + 0.867069376285, 0.721500455585, 0.593906509971}, + {2.62502345193, 1.69112867013, 1.17180569821, 1.3342742857, 1.28513006198, + 1.13381474809, 0.962064122248, 0.802254508198}, + 
{1.26180942886, 1.17180569821, 0.944981930573, 0.990876405848, + 0.995903384143, 0.926972725286, 0.820534991409, 0.706020324706}, + {1.11019789803, 1.3342742857, 0.990876405848, 0.831632933426, 0.77418706195, + 0.725539939514, 0.661776842059, 0.587716619023}, + {1.01397751469, 1.28513006198, 0.995903384143, 0.77418706195, + 0.653238524286, 0.584635025748, 0.531064164893, 0.478717061273}, + {0.867069376285, 1.13381474809, 0.926972725286, 0.725539939514, + 0.584635025748, 0.496936637883, 0.438694579826, 0.393021669543}, + {0.721500455585, 0.962064122248, 0.820534991409, 0.661776842059, + 0.531064164893, 0.438694579826, 0.375820256136, 0.330555063063}, + {0.593906509971, 0.802254508198, 0.706020324706, 0.587716619023, + 0.478717061273, 0.393021669543, 0.330555063063, 0.285345396658}}; static double convert_score_db(double _score, double _weight) { return 10 * (log10(255 * 255) - log10(_weight * _score)); @@ -89,7 +91,7 @@ static double calc_psnrhvs(const unsigned char *_src, int _systride, const unsigned char *_dst, int _dystride, double _par, int _w, int _h, int _step, - float _csf[8][8]) { + const float _csf[8][8]) { float ret; int16_t dct_s[8 * 8], dct_d[8 * 8]; tran_low_t dct_s_coef[8 * 8], dct_d_coef[8 * 8]; @@ -200,11 +202,12 @@ ret /= pixels; return ret; } -double vpx_psnrhvs(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, - double *y_psnrhvs, double *u_psnrhvs, double *v_psnrhvs) { +double vpx_psnrhvs(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *y_psnrhvs, + double *u_psnrhvs, double *v_psnrhvs) { double psnrhvs; - double par = 1.0; - int step = 7; + const double par = 1.0; + const int step = 7; vpx_clear_system_state(); *y_psnrhvs = calc_psnrhvs(source->y_buffer, source->y_stride, dest->y_buffer, dest->y_stride, par, source->y_crop_width,
diff --git a/vpx_dsp/ssim.c b/vpx_dsp/ssim.c index 991906f..e69c813 100644 --- a/vpx_dsp/ssim.c +++ b/vpx_dsp/ssim.c
@@ -13,10 +13,10 @@ #include "vpx_dsp/ssim.h" #include "vpx_ports/mem.h" -void vpx_ssim_parms_16x16_c(uint8_t *s, int sp, uint8_t *r, - int rp, unsigned long *sum_s, unsigned long *sum_r, - unsigned long *sum_sq_s, unsigned long *sum_sq_r, - unsigned long *sum_sxr) { +void vpx_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, + int rp, uint32_t *sum_s, uint32_t *sum_r, + uint32_t *sum_sq_s, uint32_t *sum_sq_r, + uint32_t *sum_sxr) { int i, j; for (i = 0; i < 16; i++, s += sp, r += rp) { for (j = 0; j < 16; j++) { @@ -28,10 +28,10 @@ } } } -void vpx_ssim_parms_8x8_c(uint8_t *s, int sp, uint8_t *r, int rp, - unsigned long *sum_s, unsigned long *sum_r, - unsigned long *sum_sq_s, unsigned long *sum_sq_r, - unsigned long *sum_sxr) { +void vpx_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp, + uint32_t *sum_s, uint32_t *sum_r, + uint32_t *sum_sq_s, uint32_t *sum_sq_r, + uint32_t *sum_sxr) { int i, j; for (i = 0; i < 8; i++, s += sp, r += rp) { for (j = 0; j < 8; j++) { @@ -45,7 +45,8 @@ } #if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_ssim_parms_8x8_c(uint16_t *s, int sp, uint16_t *r, int rp, +void vpx_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, + const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr) { @@ -65,9 +66,9 @@ static const int64_t cc1 = 26634; // (64^2*(.01*255)^2 static const int64_t cc2 = 239708; // (64^2*(.03*255)^2 -static double similarity(unsigned long sum_s, unsigned long sum_r, - unsigned long sum_sq_s, unsigned long sum_sq_r, - unsigned long sum_sxr, int count) { +static double similarity(uint32_t sum_s, uint32_t sum_r, + uint32_t sum_sq_s, uint32_t sum_sq_r, + uint32_t sum_sxr, int count) { int64_t ssim_n, ssim_d; int64_t c1, c2; @@ -85,16 +86,16 @@ return ssim_n * 1.0 / ssim_d; } -static double ssim_8x8(uint8_t *s, int sp, uint8_t *r, int rp) { - unsigned long sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; +static double 
ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) { + uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; vpx_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64); } #if CONFIG_VP9_HIGHBITDEPTH -static double highbd_ssim_8x8(uint16_t *s, int sp, uint16_t *r, int rp, - unsigned int bd) { +static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r, + int rp, unsigned int bd) { uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; const int oshift = bd - 8; vpx_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, @@ -111,8 +112,9 @@ // We are using a 8x8 moving window with starting location of each 8x8 window // on the 4x4 pixel grid. Such arrangement allows the windows to overlap // block boundaries to penalize blocking artifacts. -double vpx_ssim2(uint8_t *img1, uint8_t *img2, int stride_img1, - int stride_img2, int width, int height) { +static double vpx_ssim2(const uint8_t *img1, const uint8_t *img2, + int stride_img1, int stride_img2, int width, + int height) { int i, j; int samples = 0; double ssim_total = 0; @@ -131,9 +133,9 @@ } #if CONFIG_VP9_HIGHBITDEPTH -double vpx_highbd_ssim2(uint8_t *img1, uint8_t *img2, int stride_img1, - int stride_img2, int width, int height, - unsigned int bd) { +static double vpx_highbd_ssim2(const uint8_t *img1, const uint8_t *img2, + int stride_img1, int stride_img2, int width, + int height, unsigned int bd) { int i, j; int samples = 0; double ssim_total = 0; @@ -154,7 +156,8 @@ } #endif // CONFIG_VP9_HIGHBITDEPTH -double vpx_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, +double vpx_calc_ssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *weight) { double a, b, c; double ssimv; @@ -178,7 +181,8 @@ return ssimv; } -double vpx_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, +double 
vpx_calc_ssimg(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *ssim_y, double *ssim_u, double *ssim_v) { double ssim_all = 0; double a, b, c; @@ -231,7 +235,7 @@ // Replace c1 with n*n * c1 for the final step that leads to this code: // The final step scales by 12 bits so we don't lose precision in the constants. -double ssimv_similarity(Ssimv *sv, int64_t n) { +static double ssimv_similarity(const Ssimv *sv, int64_t n) { // Scale the constants by number of pixels. const int64_t c1 = (cc1 * n * n) >> 12; const int64_t c2 = (cc2 * n * n) >> 12; @@ -262,7 +266,7 @@ // // 255 * 255 - (sum_s - sum_r) / count * (sum_s - sum_r) / count // -double ssimv_similarity2(Ssimv *sv, int64_t n) { +static double ssimv_similarity2(const Ssimv *sv, int64_t n) { // Scale the constants by number of pixels. const int64_t c1 = (cc1 * n * n) >> 12; const int64_t c2 = (cc2 * n * n) >> 12; @@ -278,8 +282,8 @@ return l * v; } -void ssimv_parms(uint8_t *img1, int img1_pitch, uint8_t *img2, int img2_pitch, - Ssimv *sv) { +static void ssimv_parms(uint8_t *img1, int img1_pitch, uint8_t *img2, + int img2_pitch, Ssimv *sv) { vpx_ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch, &sv->sum_s, &sv->sum_r, &sv->sum_sq_s, &sv->sum_sq_r, &sv->sum_sxr); @@ -448,8 +452,8 @@ #if CONFIG_VP9_HIGHBITDEPTH -double vpx_highbd_calc_ssim(YV12_BUFFER_CONFIG *source, - YV12_BUFFER_CONFIG *dest, +double vpx_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *weight, unsigned int bd) { double a, b, c; double ssimv; @@ -473,8 +477,8 @@ return ssimv; } -double vpx_highbd_calc_ssimg(YV12_BUFFER_CONFIG *source, - YV12_BUFFER_CONFIG *dest, double *ssim_y, +double vpx_highbd_calc_ssimg(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *ssim_y, double *ssim_u, double *ssim_v, unsigned int bd) { double ssim_all = 0; double a, b, c;
diff --git a/vpx_dsp/ssim.h b/vpx_dsp/ssim.h index b1579f7..d25e714 100644 --- a/vpx_dsp/ssim.h +++ b/vpx_dsp/ssim.h
@@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_ENCODER_VP9_SSIM_H_ -#define VPX_ENCODER_VP9_SSIM_H_ +#ifndef VPX_DSP_SSIM_H_ +#define VPX_DSP_SSIM_H_ #ifdef __cplusplus extern "C" { @@ -29,19 +29,19 @@ // metrics used for calculating ssim, ssim2, dssim, and ssimc typedef struct { // source sum ( over 8x8 region ) - uint64_t sum_s; + uint32_t sum_s; // reference sum (over 8x8 region ) - uint64_t sum_r; + uint32_t sum_r; // source sum squared ( over 8x8 region ) - uint64_t sum_sq_s; + uint32_t sum_sq_s; // reference sum squared (over 8x8 region ) - uint64_t sum_sq_r; + uint32_t sum_sq_r; // sum of source times reference (over 8x8 region) - uint64_t sum_sxr; + uint32_t sum_sxr; // calculated ssim score between source and reference double ssim; @@ -72,26 +72,29 @@ int img2_pitch, int width, int height, Ssimv *sv2, Metrics *m, int do_inconsistency); -double vpx_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, +double vpx_calc_ssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *weight); -double vpx_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, +double vpx_calc_ssimg(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *ssim_y, double *ssim_u, double *ssim_v); double vpx_calc_fastssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, double *ssim_y, double *ssim_u, double *ssim_v); -double vpx_psnrhvs(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, +double vpx_psnrhvs(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *ssim_y, double *ssim_u, double *ssim_v); #if CONFIG_VP9_HIGHBITDEPTH -double vpx_highbd_calc_ssim(YV12_BUFFER_CONFIG *source, - YV12_BUFFER_CONFIG *dest, +double vpx_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *weight, unsigned int bd); -double vpx_highbd_calc_ssimg(YV12_BUFFER_CONFIG *source, - YV12_BUFFER_CONFIG *dest, +double 
vpx_highbd_calc_ssimg(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *ssim_y, double *ssim_u, double *ssim_v, @@ -102,4 +105,4 @@ } // extern "C" #endif -#endif // VPX_ENCODER_VP9_SSIM_H_ +#endif // VPX_DSP_SSIM_H_
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 3260227..6c8420e 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -994,10 +994,10 @@ # Structured Similarity (SSIM) # if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") { - add_proto qw/void vpx_ssim_parms_8x8/, "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"; + add_proto qw/void vpx_ssim_parms_8x8/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr"; specialize qw/vpx_ssim_parms_8x8/, "$sse2_x86_64"; - add_proto qw/void vpx_ssim_parms_16x16/, "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"; + add_proto qw/void vpx_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr"; specialize qw/vpx_ssim_parms_16x16/, "$sse2_x86_64"; }
diff --git a/vpx_dsp/x86/ssim_opt_x86_64.asm b/vpx_dsp/x86/ssim_opt_x86_64.asm index 5d05d4f..6d58321 100644 --- a/vpx_dsp/x86/ssim_opt_x86_64.asm +++ b/vpx_dsp/x86/ssim_opt_x86_64.asm
@@ -49,11 +49,11 @@ ; int sp, ; unsigned char *r, ; int rp -; unsigned long *sum_s, -; unsigned long *sum_r, -; unsigned long *sum_sq_s, -; unsigned long *sum_sq_r, -; unsigned long *sum_sxr); +; uint32_t *sum_s, +; uint32_t *sum_r, +; uint32_t *sum_sq_s, +; uint32_t *sum_sq_r, +; uint32_t *sum_sxr); ; ; TODO: Use parm passing through structure, probably don't need the pxors ; ( calling app will initialize to 0 ) could easily fit everything in sse2 @@ -139,11 +139,11 @@ ; int sp, ; unsigned char *r, ; int rp -; unsigned long *sum_s, -; unsigned long *sum_r, -; unsigned long *sum_sq_s, -; unsigned long *sum_sq_r, -; unsigned long *sum_sxr); +; uint32_t *sum_s, +; uint32_t *sum_r, +; uint32_t *sum_sq_s, +; uint32_t *sum_sq_r, +; uint32_t *sum_sxr); ; ; TODO: Use parm passing through structure, probably don't need the pxors ; ( calling app will initialize to 0 ) could easily fit everything in sse2