Add avifImageRGBToYUVLibYUV()

Add avifRGBImage::chromaDownsampling.

Use libyuv for RGB to YUV conversion when possible. For now only 8-bit
BT.601 is handled and only for some RGB and YUV layouts.
The conversion itself can be several times faster than the current
built-in floating point conversion. For example, avifImageRGBToYUV()
took 1.6s (built-in) vs 0.2s (libyuv) during a local test.

For full range images, the precision loss is comparable to the existing
difference between built-in and libyuv YUV to RGB conversion, both of
which are already used by libavif.

See also https://bugs.chromium.org/p/libyuv/issues/detail?id=936.

This change does not impact the core library. Only the tooling dealing
with non-YUV color spaces should be faster in some cases. For example,
avifenc is expected to be up to 7% faster when encoding PNG images at
AVIF_SPEED_FASTEST for available libyuv conversions.

Update CHANGELOG.md.
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7c0cb1d..8648e95 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,8 @@
 ## [Unreleased]
 
 There are incompatible ABI changes in this release. The alphaRange member was
-removed from avifImage struct. avifImageCopy() and avifImageAllocatePlanes()
+removed from avifImage struct. The chromaDownsampling member was added to the
+avifRGBImage struct. avifImageCopy() and avifImageAllocatePlanes()
 signatures changed. It is necessary to recompile your code. Also check the
 return values of avifImageCopy() and avifImageAllocatePlanes().
 
@@ -15,12 +16,19 @@
 * Update aom.cmd: v3.4.0
 * Update svt.cmd/svt.sh: v1.1.0
 * avifImageCopy() and avifImageAllocatePlanes() now return avifResult instead of
-  void to report invalid parameters or memory allocation failures
+  void to report invalid parameters or memory allocation failures.
+* avifImageRGBToYUV() now uses libyuv fast paths by default. It may slightly
+  change conversion results. The old behavior can be restored by setting
+  avifRGBImage::chromaDownsampling to AVIF_CHROMA_DOWNSAMPLING_BEST_QUALITY.
 
 ### Removed
 * alphaRange field was removed from the avifImage struct. It it presumed that
   alpha plane is always full range.
 
+### Added
+* Add avifChromaDownsampling enum
+* Add chromaDownsampling field to avifRGBImage struct
+
 ## [0.10.1] - 2022-04-11
 
 ### Changed
diff --git a/examples/avif_example_encode.c b/examples/avif_example_encode.c
index 054cbdf..36c4ca9 100644
--- a/examples/avif_example_encode.c
+++ b/examples/avif_example_encode.c
@@ -57,7 +57,7 @@
         printf("Encoding from converted RGBA\n");
 
         avifRGBImageSetDefaults(&rgb, image);
-        // Override RGB(A)->YUV(A) defaults here: depth, format, chromaUpsampling, ignoreAlpha, alphaPremultiplied, libYUVUsage, etc
+        // Override RGB(A)->YUV(A) defaults here: depth, format, chromaDownsampling, ignoreAlpha, alphaPremultiplied, libYUVUsage, etc
 
         // Alternative: set rgb.pixels and rgb.rowBytes yourself, which should match your chosen rgb.format
         // Be sure to use uint16_t* instead of uint8_t* for rgb.pixels/rgb.rowBytes if (rgb.depth > 8)
diff --git a/include/avif/avif.h b/include/avif/avif.h
index 459fa32..9975168 100644
--- a/include/avif/avif.h
+++ b/include/avif/avif.h
@@ -531,6 +531,19 @@
 //   * x/x/[1|2|5|6|9]/Limited
 //   * [1|2|5|6|9]/x/12/Limited
 
+// If libavif is built with libyuv fast paths enabled, libavif will use libyuv for conversion from
+// RGB to YUV if the following requirements are met:
+//
+// * YUV depth: 8
+// * RGB depth: 8
+// * rgb.chromaDownsampling: AVIF_CHROMA_DOWNSAMPLING_AUTOMATIC, AVIF_CHROMA_DOWNSAMPLING_FASTEST
+// * One of the following combinations (avifRGBFormat to avifPixelFormat/MC/Range):
+//   *  BGRA            to  YUV400        /  x  /[Full|Limited]
+//   *  BGRA            to [YUV420|YUV422]/[5|6]/[Full|Limited]
+//   *  BGRA            to  YUV444        /[5|6]/ Limited
+//   *  BGR             to  YUV420        /[5|6]/[Full|Limited]
+//   * [RGBA|ARGB|ABGR] to  YUV420        /[5|6]/ Limited
+
 typedef enum avifRGBFormat
 {
     AVIF_RGB_FORMAT_RGB = 0,
@@ -552,6 +565,14 @@
     AVIF_CHROMA_UPSAMPLING_BILINEAR = 4      // Uses bilinear filter (built-in)
 } avifChromaUpsampling;
 
+typedef enum avifChromaDownsampling
+{
+    AVIF_CHROMA_DOWNSAMPLING_AUTOMATIC = 0,    // Chooses best trade off of speed/quality (prefers libyuv, else uses BEST_QUALITY)
+    AVIF_CHROMA_DOWNSAMPLING_FASTEST = 1,      // Chooses speed over quality (prefers libyuv, else uses AVERAGE)
+    AVIF_CHROMA_DOWNSAMPLING_BEST_QUALITY = 2, // Chooses the best quality downsampling (avoids libyuv, uses AVERAGE)
+    AVIF_CHROMA_DOWNSAMPLING_AVERAGE = 3       // Uses floating point RGB-to-YUV conversion then averaging (built-in)
+} avifChromaDownsampling;
+
 typedef struct avifRGBImage
 {
     uint32_t width;       // must match associated avifImage
@@ -560,6 +581,8 @@
     avifRGBFormat format; // all channels are always full range
     avifChromaUpsampling chromaUpsampling; // Defaults to AVIF_CHROMA_UPSAMPLING_AUTOMATIC: How to upsample non-4:4:4 UV (ignored for 444) when converting to RGB.
                                            // Unused when converting to YUV. avifRGBImageSetDefaults() prefers quality over speed.
+    avifChromaDownsampling chromaDownsampling; // How to convert (and downsample to non-4:4:4 UV) when converting to YUV.
+                                               // Unused when converting to RGB. Defaults to AVIF_CHROMA_DOWNSAMPLING_AUTOMATIC.
     avifBool ignoreAlpha;        // Used for XRGB formats, treats formats containing alpha (such as ARGB) as if they were
                                  // RGB, treating the alpha bits as if they were all 1.
     avifBool alphaPremultiplied; // indicates if RGB value is pre-multiplied by alpha. Default: false
diff --git a/include/avif/internal.h b/include/avif/internal.h
index ace7611..fa52456 100644
--- a/include/avif/internal.h
+++ b/include/avif/internal.h
@@ -141,6 +141,12 @@
 
 // Returns:
 // * AVIF_RESULT_OK              - Converted successfully with libyuv
+// * AVIF_RESULT_NOT_IMPLEMENTED - The fast path for this combination is not implemented with libyuv, use built-in RGB conversion
+// * [any other error]           - Return error to caller
+avifResult avifImageRGBToYUVLibYUV(avifImage * image, const avifRGBImage * rgb);
+
+// Returns:
+// * AVIF_RESULT_OK              - Converted successfully with libyuv
 // * AVIF_RESULT_NOT_IMPLEMENTED - The fast path for this combination is not implemented with libyuv, use built-in YUV conversion
 // * [any other error]           - Return error to caller
 avifResult avifImageYUVToRGBLibYUV(const avifImage * image, avifRGBImage * rgb);
diff --git a/src/avif.c b/src/avif.c
index 56319a1..a4a7c0b 100644
--- a/src/avif.c
+++ b/src/avif.c
@@ -440,6 +440,7 @@
     rgb->depth = image->depth;
     rgb->format = AVIF_RGB_FORMAT_RGBA;
     rgb->chromaUpsampling = AVIF_CHROMA_UPSAMPLING_AUTOMATIC;
+    rgb->chromaDownsampling = AVIF_CHROMA_DOWNSAMPLING_AUTOMATIC;
     rgb->ignoreAlpha = AVIF_FALSE;
     rgb->pixels = NULL;
     rgb->rowBytes = 0;
diff --git a/src/reformat.c b/src/reformat.c
index 1e2f89a..fbd73da 100644
--- a/src/reformat.c
+++ b/src/reformat.c
@@ -208,172 +208,160 @@
         }
     }
 
-    const float kr = state.kr;
-    const float kg = state.kg;
-    const float kb = state.kb;
+    avifBool convertedWithLibYUV = AVIF_FALSE;
+    if (alphaMode == AVIF_ALPHA_MULTIPLY_MODE_NO_OP) {
+        avifResult libyuvResult = avifImageRGBToYUVLibYUV(image, rgb);
+        if (libyuvResult == AVIF_RESULT_OK) {
+            convertedWithLibYUV = AVIF_TRUE;
+        } else if (libyuvResult != AVIF_RESULT_NOT_IMPLEMENTED) {
+            return libyuvResult;
+        }
+    }
 
-    struct YUVBlock yuvBlock[2][2];
-    float rgbPixel[3];
-    const float rgbMaxChannelF = state.rgbMaxChannelF;
-    uint8_t ** yuvPlanes = image->yuvPlanes;
-    uint32_t * yuvRowBytes = image->yuvRowBytes;
-    for (uint32_t outerJ = 0; outerJ < image->height; outerJ += 2) {
-        for (uint32_t outerI = 0; outerI < image->width; outerI += 2) {
-            int blockW = 2, blockH = 2;
-            if ((outerI + 1) >= image->width) {
-                blockW = 1;
-            }
-            if ((outerJ + 1) >= image->height) {
-                blockH = 1;
-            }
+    if (!convertedWithLibYUV) {
+        const float kr = state.kr;
+        const float kg = state.kg;
+        const float kb = state.kb;
 
-            // Convert an entire 2x2 block to YUV, and populate any fully sampled channels as we go
-            for (int bJ = 0; bJ < blockH; ++bJ) {
-                for (int bI = 0; bI < blockW; ++bI) {
-                    int i = outerI + bI;
-                    int j = outerJ + bJ;
+        struct YUVBlock yuvBlock[2][2];
+        float rgbPixel[3];
+        const float rgbMaxChannelF = state.rgbMaxChannelF;
+        uint8_t ** yuvPlanes = image->yuvPlanes;
+        uint32_t * yuvRowBytes = image->yuvRowBytes;
+        for (uint32_t outerJ = 0; outerJ < image->height; outerJ += 2) {
+            for (uint32_t outerI = 0; outerI < image->width; outerI += 2) {
+                int blockW = 2, blockH = 2;
+                if ((outerI + 1) >= image->width) {
+                    blockW = 1;
+                }
+                if ((outerJ + 1) >= image->height) {
+                    blockH = 1;
+                }
 
-                    // Unpack RGB into normalized float
-                    if (state.rgbChannelBytes > 1) {
-                        rgbPixel[0] =
-                            *((uint16_t *)(&rgb->pixels[state.rgbOffsetBytesR + (i * state.rgbPixelBytes) + (j * rgb->rowBytes)])) /
-                            rgbMaxChannelF;
-                        rgbPixel[1] =
-                            *((uint16_t *)(&rgb->pixels[state.rgbOffsetBytesG + (i * state.rgbPixelBytes) + (j * rgb->rowBytes)])) /
-                            rgbMaxChannelF;
-                        rgbPixel[2] =
-                            *((uint16_t *)(&rgb->pixels[state.rgbOffsetBytesB + (i * state.rgbPixelBytes) + (j * rgb->rowBytes)])) /
-                            rgbMaxChannelF;
-                    } else {
-                        rgbPixel[0] = rgb->pixels[state.rgbOffsetBytesR + (i * state.rgbPixelBytes) + (j * rgb->rowBytes)] / rgbMaxChannelF;
-                        rgbPixel[1] = rgb->pixels[state.rgbOffsetBytesG + (i * state.rgbPixelBytes) + (j * rgb->rowBytes)] / rgbMaxChannelF;
-                        rgbPixel[2] = rgb->pixels[state.rgbOffsetBytesB + (i * state.rgbPixelBytes) + (j * rgb->rowBytes)] / rgbMaxChannelF;
-                    }
+                // Convert an entire 2x2 block to YUV, and populate any fully sampled channels as we go
+                for (int bJ = 0; bJ < blockH; ++bJ) {
+                    for (int bI = 0; bI < blockW; ++bI) {
+                        int i = outerI + bI;
+                        int j = outerJ + bJ;
 
-                    if (alphaMode != AVIF_ALPHA_MULTIPLY_MODE_NO_OP) {
-                        float a;
+                        // Unpack RGB into normalized float
                         if (state.rgbChannelBytes > 1) {
-                            a = *((uint16_t *)(&rgb->pixels[state.rgbOffsetBytesA + (i * state.rgbPixelBytes) + (j * rgb->rowBytes)])) /
+                            rgbPixel[0] =
+                                *((uint16_t *)(&rgb->pixels[state.rgbOffsetBytesR + (i * state.rgbPixelBytes) + (j * rgb->rowBytes)])) /
+                                rgbMaxChannelF;
+                            rgbPixel[1] =
+                                *((uint16_t *)(&rgb->pixels[state.rgbOffsetBytesG + (i * state.rgbPixelBytes) + (j * rgb->rowBytes)])) /
+                                rgbMaxChannelF;
+                            rgbPixel[2] =
+                                *((uint16_t *)(&rgb->pixels[state.rgbOffsetBytesB + (i * state.rgbPixelBytes) + (j * rgb->rowBytes)])) /
                                 rgbMaxChannelF;
                         } else {
-                            a = rgb->pixels[state.rgbOffsetBytesA + (i * state.rgbPixelBytes) + (j * rgb->rowBytes)] / rgbMaxChannelF;
+                            rgbPixel[0] = rgb->pixels[state.rgbOffsetBytesR + (i * state.rgbPixelBytes) + (j * rgb->rowBytes)] /
+                                          rgbMaxChannelF;
+                            rgbPixel[1] = rgb->pixels[state.rgbOffsetBytesG + (i * state.rgbPixelBytes) + (j * rgb->rowBytes)] /
+                                          rgbMaxChannelF;
+                            rgbPixel[2] = rgb->pixels[state.rgbOffsetBytesB + (i * state.rgbPixelBytes) + (j * rgb->rowBytes)] /
+                                          rgbMaxChannelF;
                         }
 
-                        if (alphaMode == AVIF_ALPHA_MULTIPLY_MODE_MULTIPLY) {
-                            if (a == 0) {
-                                rgbPixel[0] = 0;
-                                rgbPixel[1] = 0;
-                                rgbPixel[2] = 0;
-                            } else if (a < 1.0f) {
-                                rgbPixel[0] *= a;
-                                rgbPixel[1] *= a;
-                                rgbPixel[2] *= a;
+                        if (alphaMode != AVIF_ALPHA_MULTIPLY_MODE_NO_OP) {
+                            float a;
+                            if (state.rgbChannelBytes > 1) {
+                                a = *((uint16_t *)(&rgb->pixels[state.rgbOffsetBytesA + (i * state.rgbPixelBytes) + (j * rgb->rowBytes)])) /
+                                    rgbMaxChannelF;
+                            } else {
+                                a = rgb->pixels[state.rgbOffsetBytesA + (i * state.rgbPixelBytes) + (j * rgb->rowBytes)] / rgbMaxChannelF;
+                            }
+
+                            if (alphaMode == AVIF_ALPHA_MULTIPLY_MODE_MULTIPLY) {
+                                if (a == 0) {
+                                    rgbPixel[0] = 0;
+                                    rgbPixel[1] = 0;
+                                    rgbPixel[2] = 0;
+                                } else if (a < 1.0f) {
+                                    rgbPixel[0] *= a;
+                                    rgbPixel[1] *= a;
+                                    rgbPixel[2] *= a;
+                                }
+                            } else {
+                                // alphaMode == AVIF_ALPHA_MULTIPLY_MODE_UNMULTIPLY
+                                if (a == 0) {
+                                    rgbPixel[0] = 0;
+                                    rgbPixel[1] = 0;
+                                    rgbPixel[2] = 0;
+                                } else if (a < 1.0f) {
+                                    rgbPixel[0] /= a;
+                                    rgbPixel[1] /= a;
+                                    rgbPixel[2] /= a;
+                                    rgbPixel[0] = AVIF_MIN(rgbPixel[0], 1.0f);
+                                    rgbPixel[1] = AVIF_MIN(rgbPixel[1], 1.0f);
+                                    rgbPixel[2] = AVIF_MIN(rgbPixel[2], 1.0f);
+                                }
+                            }
+                        }
+
+                        // RGB -> YUV conversion
+                        if (state.mode == AVIF_REFORMAT_MODE_IDENTITY) {
+                            // Formulas 41,42,43 from https://www.itu.int/rec/T-REC-H.273-201612-I/en
+                            yuvBlock[bI][bJ].y = rgbPixel[1]; // G
+                            yuvBlock[bI][bJ].u = rgbPixel[2]; // B
+                            yuvBlock[bI][bJ].v = rgbPixel[0]; // R
+                        } else if (state.mode == AVIF_REFORMAT_MODE_YCGCO) {
+                            // Formulas 44,45,46 from https://www.itu.int/rec/T-REC-H.273-201612-I/en
+                            yuvBlock[bI][bJ].y = 0.5f * rgbPixel[1] + 0.25f * (rgbPixel[0] + rgbPixel[2]);
+                            yuvBlock[bI][bJ].u = 0.5f * rgbPixel[1] - 0.25f * (rgbPixel[0] + rgbPixel[2]);
+                            yuvBlock[bI][bJ].v = 0.5f * (rgbPixel[0] - rgbPixel[2]);
+                        } else {
+                            float Y = (kr * rgbPixel[0]) + (kg * rgbPixel[1]) + (kb * rgbPixel[2]);
+                            yuvBlock[bI][bJ].y = Y;
+                            yuvBlock[bI][bJ].u = (rgbPixel[2] - Y) / (2 * (1 - kb));
+                            yuvBlock[bI][bJ].v = (rgbPixel[0] - Y) / (2 * (1 - kr));
+                        }
+
+                        if (state.yuvChannelBytes > 1) {
+                            uint16_t * pY = (uint16_t *)&yuvPlanes[AVIF_CHAN_Y][(i * 2) + (j * yuvRowBytes[AVIF_CHAN_Y])];
+                            *pY = (uint16_t)avifReformatStateYToUNorm(&state, yuvBlock[bI][bJ].y);
+                            if (image->yuvFormat == AVIF_PIXEL_FORMAT_YUV444) {
+                                // YUV444, full chroma
+                                uint16_t * pU = (uint16_t *)&yuvPlanes[AVIF_CHAN_U][(i * 2) + (j * yuvRowBytes[AVIF_CHAN_U])];
+                                *pU = (uint16_t)avifReformatStateUVToUNorm(&state, yuvBlock[bI][bJ].u);
+                                uint16_t * pV = (uint16_t *)&yuvPlanes[AVIF_CHAN_V][(i * 2) + (j * yuvRowBytes[AVIF_CHAN_V])];
+                                *pV = (uint16_t)avifReformatStateUVToUNorm(&state, yuvBlock[bI][bJ].v);
                             }
                         } else {
-                            // alphaMode == AVIF_ALPHA_MULTIPLY_MODE_UNMULTIPLY
-                            if (a == 0) {
-                                rgbPixel[0] = 0;
-                                rgbPixel[1] = 0;
-                                rgbPixel[2] = 0;
-                            } else if (a < 1.0f) {
-                                rgbPixel[0] /= a;
-                                rgbPixel[1] /= a;
-                                rgbPixel[2] /= a;
-                                rgbPixel[0] = AVIF_MIN(rgbPixel[0], 1.0f);
-                                rgbPixel[1] = AVIF_MIN(rgbPixel[1], 1.0f);
-                                rgbPixel[2] = AVIF_MIN(rgbPixel[2], 1.0f);
+                            yuvPlanes[AVIF_CHAN_Y][i + (j * yuvRowBytes[AVIF_CHAN_Y])] =
+                                (uint8_t)avifReformatStateYToUNorm(&state, yuvBlock[bI][bJ].y);
+                            if (image->yuvFormat == AVIF_PIXEL_FORMAT_YUV444) {
+                                // YUV444, full chroma
+                                yuvPlanes[AVIF_CHAN_U][i + (j * yuvRowBytes[AVIF_CHAN_U])] =
+                                    (uint8_t)avifReformatStateUVToUNorm(&state, yuvBlock[bI][bJ].u);
+                                yuvPlanes[AVIF_CHAN_V][i + (j * yuvRowBytes[AVIF_CHAN_V])] =
+                                    (uint8_t)avifReformatStateUVToUNorm(&state, yuvBlock[bI][bJ].v);
                             }
                         }
                     }
-
-                    // RGB -> YUV conversion
-                    if (state.mode == AVIF_REFORMAT_MODE_IDENTITY) {
-                        // Formulas 41,42,43 from https://www.itu.int/rec/T-REC-H.273-201612-I/en
-                        yuvBlock[bI][bJ].y = rgbPixel[1]; // G
-                        yuvBlock[bI][bJ].u = rgbPixel[2]; // B
-                        yuvBlock[bI][bJ].v = rgbPixel[0]; // R
-                    } else if (state.mode == AVIF_REFORMAT_MODE_YCGCO) {
-                        // Formulas 44,45,46 from https://www.itu.int/rec/T-REC-H.273-201612-I/en
-                        yuvBlock[bI][bJ].y = 0.5f * rgbPixel[1] + 0.25f * (rgbPixel[0] + rgbPixel[2]);
-                        yuvBlock[bI][bJ].u = 0.5f * rgbPixel[1] - 0.25f * (rgbPixel[0] + rgbPixel[2]);
-                        yuvBlock[bI][bJ].v = 0.5f * (rgbPixel[0] - rgbPixel[2]);
-                    } else {
-                        float Y = (kr * rgbPixel[0]) + (kg * rgbPixel[1]) + (kb * rgbPixel[2]);
-                        yuvBlock[bI][bJ].y = Y;
-                        yuvBlock[bI][bJ].u = (rgbPixel[2] - Y) / (2 * (1 - kb));
-                        yuvBlock[bI][bJ].v = (rgbPixel[0] - Y) / (2 * (1 - kr));
-                    }
-
-                    if (state.yuvChannelBytes > 1) {
-                        uint16_t * pY = (uint16_t *)&yuvPlanes[AVIF_CHAN_Y][(i * 2) + (j * yuvRowBytes[AVIF_CHAN_Y])];
-                        *pY = (uint16_t)avifReformatStateYToUNorm(&state, yuvBlock[bI][bJ].y);
-                        if (image->yuvFormat == AVIF_PIXEL_FORMAT_YUV444) {
-                            // YUV444, full chroma
-                            uint16_t * pU = (uint16_t *)&yuvPlanes[AVIF_CHAN_U][(i * 2) + (j * yuvRowBytes[AVIF_CHAN_U])];
-                            *pU = (uint16_t)avifReformatStateUVToUNorm(&state, yuvBlock[bI][bJ].u);
-                            uint16_t * pV = (uint16_t *)&yuvPlanes[AVIF_CHAN_V][(i * 2) + (j * yuvRowBytes[AVIF_CHAN_V])];
-                            *pV = (uint16_t)avifReformatStateUVToUNorm(&state, yuvBlock[bI][bJ].v);
-                        }
-                    } else {
-                        yuvPlanes[AVIF_CHAN_Y][i + (j * yuvRowBytes[AVIF_CHAN_Y])] =
-                            (uint8_t)avifReformatStateYToUNorm(&state, yuvBlock[bI][bJ].y);
-                        if (image->yuvFormat == AVIF_PIXEL_FORMAT_YUV444) {
-                            // YUV444, full chroma
-                            yuvPlanes[AVIF_CHAN_U][i + (j * yuvRowBytes[AVIF_CHAN_U])] =
-                                (uint8_t)avifReformatStateUVToUNorm(&state, yuvBlock[bI][bJ].u);
-                            yuvPlanes[AVIF_CHAN_V][i + (j * yuvRowBytes[AVIF_CHAN_V])] =
-                                (uint8_t)avifReformatStateUVToUNorm(&state, yuvBlock[bI][bJ].v);
-                        }
-                    }
                 }
-            }
 
-            // Populate any subsampled channels with averages from the 2x2 block
-            if (image->yuvFormat == AVIF_PIXEL_FORMAT_YUV420) {
-                // YUV420, average 4 samples (2x2)
+                // Populate any subsampled channels with averages from the 2x2 block
+                if (image->yuvFormat == AVIF_PIXEL_FORMAT_YUV420) {
+                    // YUV420, average 4 samples (2x2)
 
-                float sumU = 0.0f;
-                float sumV = 0.0f;
-                for (int bJ = 0; bJ < blockH; ++bJ) {
-                    for (int bI = 0; bI < blockW; ++bI) {
-                        sumU += yuvBlock[bI][bJ].u;
-                        sumV += yuvBlock[bI][bJ].v;
-                    }
-                }
-                float totalSamples = (float)(blockW * blockH);
-                float avgU = sumU / totalSamples;
-                float avgV = sumV / totalSamples;
-
-                const int chromaShiftX = 1;
-                const int chromaShiftY = 1;
-                int uvI = outerI >> chromaShiftX;
-                int uvJ = outerJ >> chromaShiftY;
-                if (state.yuvChannelBytes > 1) {
-                    uint16_t * pU = (uint16_t *)&yuvPlanes[AVIF_CHAN_U][(uvI * 2) + (uvJ * yuvRowBytes[AVIF_CHAN_U])];
-                    *pU = (uint16_t)avifReformatStateUVToUNorm(&state, avgU);
-                    uint16_t * pV = (uint16_t *)&yuvPlanes[AVIF_CHAN_V][(uvI * 2) + (uvJ * yuvRowBytes[AVIF_CHAN_V])];
-                    *pV = (uint16_t)avifReformatStateUVToUNorm(&state, avgV);
-                } else {
-                    yuvPlanes[AVIF_CHAN_U][uvI + (uvJ * yuvRowBytes[AVIF_CHAN_U])] = (uint8_t)avifReformatStateUVToUNorm(&state, avgU);
-                    yuvPlanes[AVIF_CHAN_V][uvI + (uvJ * yuvRowBytes[AVIF_CHAN_V])] = (uint8_t)avifReformatStateUVToUNorm(&state, avgV);
-                }
-            } else if (image->yuvFormat == AVIF_PIXEL_FORMAT_YUV422) {
-                // YUV422, average 2 samples (1x2), twice
-
-                for (int bJ = 0; bJ < blockH; ++bJ) {
                     float sumU = 0.0f;
                     float sumV = 0.0f;
-                    for (int bI = 0; bI < blockW; ++bI) {
-                        sumU += yuvBlock[bI][bJ].u;
-                        sumV += yuvBlock[bI][bJ].v;
+                    for (int bJ = 0; bJ < blockH; ++bJ) {
+                        for (int bI = 0; bI < blockW; ++bI) {
+                            sumU += yuvBlock[bI][bJ].u;
+                            sumV += yuvBlock[bI][bJ].v;
+                        }
                     }
-                    float totalSamples = (float)blockW;
+                    float totalSamples = (float)(blockW * blockH);
                     float avgU = sumU / totalSamples;
                     float avgV = sumV / totalSamples;
 
                     const int chromaShiftX = 1;
+                    const int chromaShiftY = 1;
                     int uvI = outerI >> chromaShiftX;
-                    int uvJ = outerJ + bJ;
+                    int uvJ = outerJ >> chromaShiftY;
                     if (state.yuvChannelBytes > 1) {
                         uint16_t * pU = (uint16_t *)&yuvPlanes[AVIF_CHAN_U][(uvI * 2) + (uvJ * yuvRowBytes[AVIF_CHAN_U])];
                         *pU = (uint16_t)avifReformatStateUVToUNorm(&state, avgU);
@@ -385,6 +373,35 @@
                         yuvPlanes[AVIF_CHAN_V][uvI + (uvJ * yuvRowBytes[AVIF_CHAN_V])] =
                             (uint8_t)avifReformatStateUVToUNorm(&state, avgV);
                     }
+                } else if (image->yuvFormat == AVIF_PIXEL_FORMAT_YUV422) {
+                    // YUV422, average 2 samples (1x2), twice
+
+                    for (int bJ = 0; bJ < blockH; ++bJ) {
+                        float sumU = 0.0f;
+                        float sumV = 0.0f;
+                        for (int bI = 0; bI < blockW; ++bI) {
+                            sumU += yuvBlock[bI][bJ].u;
+                            sumV += yuvBlock[bI][bJ].v;
+                        }
+                        float totalSamples = (float)blockW;
+                        float avgU = sumU / totalSamples;
+                        float avgV = sumV / totalSamples;
+
+                        const int chromaShiftX = 1;
+                        int uvI = outerI >> chromaShiftX;
+                        int uvJ = outerJ + bJ;
+                        if (state.yuvChannelBytes > 1) {
+                            uint16_t * pU = (uint16_t *)&yuvPlanes[AVIF_CHAN_U][(uvI * 2) + (uvJ * yuvRowBytes[AVIF_CHAN_U])];
+                            *pU = (uint16_t)avifReformatStateUVToUNorm(&state, avgU);
+                            uint16_t * pV = (uint16_t *)&yuvPlanes[AVIF_CHAN_V][(uvI * 2) + (uvJ * yuvRowBytes[AVIF_CHAN_V])];
+                            *pV = (uint16_t)avifReformatStateUVToUNorm(&state, avgV);
+                        } else {
+                            yuvPlanes[AVIF_CHAN_U][uvI + (uvJ * yuvRowBytes[AVIF_CHAN_U])] =
+                                (uint8_t)avifReformatStateUVToUNorm(&state, avgU);
+                            yuvPlanes[AVIF_CHAN_V][uvI + (uvJ * yuvRowBytes[AVIF_CHAN_V])] =
+                                (uint8_t)avifReformatStateUVToUNorm(&state, avgV);
+                        }
+                    }
                 }
             }
         }
@@ -410,6 +427,8 @@
 
             avifReformatAlpha(&params);
         } else {
+            // libyuv does not fill alpha when converting from RGB to YUV so
+            // fill it regardless of the value of convertedWithLibYUV.
             avifFillAlpha(&params);
         }
     }
diff --git a/src/reformat_libyuv.c b/src/reformat_libyuv.c
index 7e42d80..10a6e4e 100644
--- a/src/reformat_libyuv.c
+++ b/src/reformat_libyuv.c
@@ -6,6 +6,12 @@
 #if !defined(AVIF_LIBYUV_ENABLED)
 
 // No libyuv!
+avifResult avifImageRGBToYUVLibYUV(avifImage * image, const avifRGBImage * rgb)
+{
+    (void)image;
+    (void)rgb;
+    return AVIF_RESULT_NOT_IMPLEMENTED;
+}
 avifResult avifImageYUVToRGBLibYUV(const avifImage * image, avifRGBImage * rgb)
 {
     (void)image;
@@ -50,6 +56,122 @@
 #pragma clang diagnostic pop
 #endif
 
+static avifResult avifImageRGBToYUVLibYUV8bpc(avifImage * image, const avifRGBImage * rgb);
+
+avifResult avifImageRGBToYUVLibYUV(avifImage * image, const avifRGBImage * rgb)
+{
+    if ((rgb->chromaDownsampling != AVIF_CHROMA_DOWNSAMPLING_AUTOMATIC) && (rgb->chromaDownsampling != AVIF_CHROMA_DOWNSAMPLING_FASTEST)) {
+        // libyuv uses integer/fixed-point averaging and RGB-to-YUV conversion.
+        // We do not ensure a specific ordering of these two steps and libyuv
+        // may perform one or the other depending on the implementation or
+        // platform. Also libyuv trades a bit of accuracy for speed, so if the
+        // end user requested best quality, avoid using libyuv as well.
+        return AVIF_RESULT_NOT_IMPLEMENTED;
+    }
+
+    if ((image->depth == 8) && (rgb->depth == 8)) {
+        return avifImageRGBToYUVLibYUV8bpc(image, rgb);
+    }
+
+    // This function didn't do anything; use the built-in conversion.
+    return AVIF_RESULT_NOT_IMPLEMENTED;
+}
+
+avifResult avifImageRGBToYUVLibYUV8bpc(avifImage * image, const avifRGBImage * rgb)
+{
+    assert((image->depth == 8) && (rgb->depth == 8));
+    // libavif uses byte-order when describing pixel formats, such that the R in RGBA is the lowest address,
+    // similar to PNG. libyuv orders in word-order, so libavif's RGBA would be referred to in libyuv as ABGR.
+
+    if (image->yuvFormat == AVIF_PIXEL_FORMAT_YUV400) {
+        // Generic mapping from any RGB layout (with or without alpha) to monochrome.
+        int (*RGBtoY)(const uint8_t *, int, uint8_t *, int, int, int) = NULL;
+
+        if (image->yuvRange == AVIF_RANGE_LIMITED) {
+            if (rgb->format == AVIF_RGB_FORMAT_BGRA) {
+                RGBtoY = ARGBToI400;
+            }
+        } else { // image->yuvRange == AVIF_RANGE_FULL
+            if (rgb->format == AVIF_RGB_FORMAT_BGRA) {
+                RGBtoY = ARGBToJ400;
+            }
+        }
+
+        if (!RGBtoY) {
+            return AVIF_RESULT_NOT_IMPLEMENTED;
+        }
+        if (RGBtoY(rgb->pixels, rgb->rowBytes, image->yuvPlanes[AVIF_CHAN_Y], image->yuvRowBytes[AVIF_CHAN_Y], image->width, image->height)) {
+            return AVIF_RESULT_REFORMAT_FAILED;
+        }
+        return AVIF_RESULT_OK;
+    }
+
+    // Generic mapping from any RGB layout (with or without alpha) to any YUV layout (subsampled or not).
+    int (*RGBtoYUV)(const uint8_t *, int, uint8_t *, int, uint8_t *, int, uint8_t *, int, int, int) = NULL;
+
+    // libyuv only handles BT.601 for RGB to YUV, and not all range/order/subsampling combinations.
+    // BT.470BG has the same coefficients as BT.601.
+    if ((image->matrixCoefficients == AVIF_MATRIX_COEFFICIENTS_BT470BG) || (image->matrixCoefficients == AVIF_MATRIX_COEFFICIENTS_BT601)) {
+        if (image->yuvRange == AVIF_RANGE_LIMITED) {
+            if (rgb->format == AVIF_RGB_FORMAT_RGBA) {
+                if (image->yuvFormat == AVIF_PIXEL_FORMAT_YUV420) {
+                    RGBtoYUV = ABGRToI420;
+                }
+            } else if (rgb->format == AVIF_RGB_FORMAT_ARGB) {
+                if (image->yuvFormat == AVIF_PIXEL_FORMAT_YUV420) {
+                    RGBtoYUV = BGRAToI420;
+                }
+            } else if (rgb->format == AVIF_RGB_FORMAT_BGR) {
+                if (image->yuvFormat == AVIF_PIXEL_FORMAT_YUV420) {
+                    RGBtoYUV = RGB24ToI420;
+                }
+            } else if (rgb->format == AVIF_RGB_FORMAT_BGRA) {
+                if (image->yuvFormat == AVIF_PIXEL_FORMAT_YUV444) {
+                    RGBtoYUV = ARGBToI444;
+                } else if (image->yuvFormat == AVIF_PIXEL_FORMAT_YUV422) {
+                    RGBtoYUV = ARGBToI422;
+                } else if (image->yuvFormat == AVIF_PIXEL_FORMAT_YUV420) {
+                    RGBtoYUV = ARGBToI420;
+                }
+            } else if (rgb->format == AVIF_RGB_FORMAT_ABGR) {
+                if (image->yuvFormat == AVIF_PIXEL_FORMAT_YUV420) {
+                    RGBtoYUV = RGBAToI420;
+                }
+            }
+        } else { // image->yuvRange == AVIF_RANGE_FULL
+            if (rgb->format == AVIF_RGB_FORMAT_BGR) {
+                if (image->yuvFormat == AVIF_PIXEL_FORMAT_YUV420) {
+                    RGBtoYUV = RGB24ToJ420;
+                }
+            } else if (rgb->format == AVIF_RGB_FORMAT_BGRA) {
+                if (image->yuvFormat == AVIF_PIXEL_FORMAT_YUV422) {
+                    RGBtoYUV = ARGBToJ422;
+                } else if (image->yuvFormat == AVIF_PIXEL_FORMAT_YUV420) {
+                    RGBtoYUV = ARGBToJ420;
+                }
+            }
+        }
+    }
+    // TODO: Use SplitRGBPlane() for AVIF_MATRIX_COEFFICIENTS_IDENTITY if faster than the current implementation
+
+    if (!RGBtoYUV) {
+        return AVIF_RESULT_NOT_IMPLEMENTED;
+    }
+    if (RGBtoYUV(rgb->pixels,
+                 rgb->rowBytes,
+                 image->yuvPlanes[AVIF_CHAN_Y],
+                 image->yuvRowBytes[AVIF_CHAN_Y],
+                 image->yuvPlanes[AVIF_CHAN_U],
+                 image->yuvRowBytes[AVIF_CHAN_U],
+                 image->yuvPlanes[AVIF_CHAN_V],
+                 image->yuvRowBytes[AVIF_CHAN_V],
+                 image->width,
+                 image->height)) {
+        return AVIF_RESULT_REFORMAT_FAILED;
+    }
+    return AVIF_RESULT_OK;
+}
+
 static avifResult avifImageYUVToRGBLibYUV8bpc(const avifImage * image,
                                               avifRGBImage * rgb,
                                               const struct YuvConstants * matrixYUV,
diff --git a/tests/gtest/avifrgbtoyuvtest.cc b/tests/gtest/avifrgbtoyuvtest.cc
index bf4533e..ef826a8 100644
--- a/tests/gtest/avifrgbtoyuvtest.cc
+++ b/tests/gtest/avifrgbtoyuvtest.cc
@@ -183,6 +183,10 @@
       SetImageChannel(&src_rgb, offsets.b, r,
                       add_noise ? kBlueNoise : kPlainColor);
 
+      // Change these to BEST_QUALITY to force built-in over libyuv conversion.
+      src_rgb.chromaDownsampling = AVIF_CHROMA_DOWNSAMPLING_AUTOMATIC;
+      dst_rgb.chromaUpsampling = AVIF_CHROMA_UPSAMPLING_AUTOMATIC;
+
       ASSERT_EQ(avifImageRGBToYUV(yuv.get(), &src_rgb), AVIF_RESULT_OK);
       ASSERT_EQ(avifImageYUVToRGB(yuv.get(), &dst_rgb), AVIF_RESULT_OK);
       GetDiffSumAndSqDiffSum(src_rgb, dst_rgb, &diff_sum, &abs_diff_sum,
@@ -299,7 +303,7 @@
         /*add_noise=*/Values(false),
         /*rgb_step=*/Values(17),
         /*max_abs_average_diff=*/Values(0.02),  // The color drift is centered.
-        /*min_psnr=*/Values(52.)  // RGB>YUV>RGB distortion is barely
+        /*min_psnr=*/Values(49.)  // RGB>YUV>RGB distortion is barely
                                   // noticeable.
         ));
 
@@ -383,5 +387,20 @@
 
 // TODO: Test other matrix coefficients than identity and bt.601.
 
+// This was used to estimate the quality loss of libyuv for RGB-to-YUV.
+// Disabled because it takes a few minutes.
+INSTANTIATE_TEST_SUITE_P(
+    DISABLED_All8bTo8b, RGBToYUVTest,
+    Combine(/*rgb_depth=*/Values(8),
+            /*yuv_depth=*/Values(8), ValuesIn(kAllRgbFormats),
+            Values(AVIF_PIXEL_FORMAT_YUV444, AVIF_PIXEL_FORMAT_YUV422,
+                   AVIF_PIXEL_FORMAT_YUV420, AVIF_PIXEL_FORMAT_YUV400),
+            Values(AVIF_RANGE_FULL, AVIF_RANGE_LIMITED),
+            Values(AVIF_MATRIX_COEFFICIENTS_BT601),
+            /*add_noise=*/Values(false, true),
+            /*rgb_step=*/Values(3),  // way faster and 99% similar to rgb_step=1
+            /*max_abs_average_diff=*/Values(10.),
+            /*min_psnr=*/Values(10.)));
+
 }  // namespace
 }  // namespace libavif