Read extended XMP in avifjpeg.c

Add unit tests and test images.
diff --git a/apps/shared/avifjpeg.c b/apps/shared/avifjpeg.c
index 29d915e..8e6356e 100644
--- a/apps/shared/avifjpeg.c
+++ b/apps/shared/avifjpeg.c
@@ -6,6 +6,7 @@
 
 #include <assert.h>
 #include <setjmp.h>
+#include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -225,6 +226,34 @@
     return AVIF_FALSE;
 }
 
+// Reads a 4-byte unsigned integer stored in big-endian order (most significant byte first) at src. Does not check bounds.
+static uint32_t avifJPEGReadUint32BigEndian(const uint8_t * src)
+{
+    return ((uint32_t)src[0] << 24) | ((uint32_t)src[1] << 16) | ((uint32_t)src[2] << 8) | ((uint32_t)src[3] << 0);
+}
+
+// Returns a pointer to the first occurrence of the substrLength bytes of substr in the strLength bytes of str, or NULL if none.
+static const uint8_t * avifJPEGFindSubstr(const uint8_t * str, size_t strLength, const uint8_t * substr, size_t substrLength)
+{
+    for (size_t index = 0; index + substrLength <= strLength; ++index) { // Only try positions where substr fits entirely in str.
+        if (!memcmp(&str[index], substr, substrLength)) { // Raw byte comparison: lengths are explicit, no null terminator needed.
+            return &str[index];
+        }
+    }
+    return NULL;
+}
+
+#define AVIF_EXTENDED_XMP_GUID_LENGTH 32
+// One way of storing the Extended XMP GUID (generated by a camera for example).
+#define AVIF_XMP_NOTE_TAG "xmpNote:HasExtendedXMP=\""
+#define AVIF_XMP_NOTE_TAG_LENGTH 24
+// Another way of storing the Extended XMP GUID (generated by exiftool for example).
+#define AVIF_ALTERNATIVE_XMP_NOTE_TAG "<xmpNote:HasExtendedXMP>"
+#define AVIF_ALTERNATIVE_XMP_NOTE_TAG_LENGTH 24
+
+// Offset of the payload in an extended XMP APP1 segment (skip tag + guid + size + offset). Needs tagExtendedXMP in scope where used.
+#define AVIF_OFFSET_TILL_EXTENDED_XMP (tagExtendedXMP.size + AVIF_EXTENDED_XMP_GUID_LENGTH + 4 + 4)
+
 // Note on setjmp() and volatile variables:
 //
 // K & R, The C Programming Language 2nd Ed, p. 254 says:
@@ -254,6 +283,11 @@
     avifRGBImage rgb;
     memset(&rgb, 0, sizeof(avifRGBImage));
 
+    // Standard XMP segment followed by all extended XMP segments.
+    avifRWData totalXMP = { NULL, 0 };
+    // Each byte set to 0 is a missing byte. Each byte set to 1 was read and copied to totalXMP.
+    avifRWData extendedXMPReadBytes = { NULL, 0 };
+
     FILE * f = fopen(inputFilename, "rb");
     if (!f) {
         fprintf(stderr, "Can't open JPEG file for read: %s\n", inputFilename);
@@ -343,7 +377,7 @@
                 !memcmp(marker->data, tagExif.data, tagExif.size)) {
                 if (found) {
                     // TODO(yguyon): Implement instead of outputting an error.
-                    fprintf(stderr, "Exif extraction failed: unsupported Exif split into multiple chunks or invalid multiple Exif chunks\n");
+                    fprintf(stderr, "Exif extraction failed: unsupported Exif split into multiple segments or invalid multiple Exif segments\n");
                     goto cleanup;
                 }
                 avifImageSetMetadataExif(avif, marker->data + tagExif.size, marker->data_length - tagExif.size);
@@ -354,24 +388,134 @@
         }
     }
     if (!ignoreXMP) {
-        const avifROData tagStandardXmp = { (const uint8_t *)"http://ns.adobe.com/xap/1.0/\0", 29 };
-        const avifROData tagExtendedXmp = { (const uint8_t *)"http://ns.adobe.com/xmp/extension/\0", 35 };
-        avifBool found = AVIF_FALSE;
+        const avifROData tagStandardXMP = { (const uint8_t *)"http://ns.adobe.com/xap/1.0/\0", 29 };
+        const uint8_t * standardXMPData = NULL;
+        uint32_t standardXMPSize = 0; // At most 64kB as defined by Adobe XMP Specification Part 3.
         for (jpeg_saved_marker_ptr marker = cinfo.marker_list; marker != NULL; marker = marker->next) {
-            if ((marker->marker == (JPEG_APP0 + 1)) && (marker->data_length > tagStandardXmp.size) &&
-                !memcmp(marker->data, tagStandardXmp.data, tagStandardXmp.size)) {
-                if (found) {
-                    fprintf(stderr, "XMP extraction failed: invalid multiple XMP chunks\n");
+            if ((marker->marker == (JPEG_APP0 + 1)) && (marker->data_length > tagStandardXMP.size) &&
+                !memcmp(marker->data, tagStandardXMP.data, tagStandardXMP.size)) {
+                if (standardXMPData) {
+                    fprintf(stderr, "XMP extraction failed: invalid multiple standard XMP segments\n");
                     goto cleanup;
                 }
-                avifImageSetMetadataXMP(avif, marker->data + tagStandardXmp.size, marker->data_length - tagStandardXmp.size);
-                found = AVIF_TRUE;
-            } else if ((marker->marker == (JPEG_APP0 + 1)) && (marker->data_length > tagExtendedXmp.size) &&
-                       !memcmp(marker->data, tagExtendedXmp.data, tagExtendedXmp.size)) {
-                // TODO(yguyon): Implement instead of outputting an error.
-                fprintf(stderr, "XMP extraction failed: extended XMP is unsupported\n");
+                standardXMPData = marker->data + tagStandardXMP.size;
+                standardXMPSize = (uint32_t)(marker->data_length - tagStandardXMP.size);
+            }
+        }
+
+        const avifROData tagExtendedXMP = { (const uint8_t *)"http://ns.adobe.com/xmp/extension/\0", 35 };
+        avifBool foundExtendedXMP = AVIF_FALSE;
+        uint8_t extendedXMPGUID[AVIF_EXTENDED_XMP_GUID_LENGTH]; // The value is common to all extended XMP segments.
+        for (jpeg_saved_marker_ptr marker = cinfo.marker_list; marker != NULL; marker = marker->next) {
+            if ((marker->marker == (JPEG_APP0 + 1)) && (marker->data_length > tagExtendedXMP.size) &&
+                !memcmp(marker->data, tagExtendedXMP.data, tagExtendedXMP.size)) {
+                if (!standardXMPData) {
+                    fprintf(stderr, "XMP extraction failed: extended XMP segment found, missing standard XMP segment\n");
+                    goto cleanup;
+                }
+
+                if (marker->data_length < AVIF_OFFSET_TILL_EXTENDED_XMP) {
+                    fprintf(stderr, "XMP extraction failed: truncated extended XMP segment\n");
+                    goto cleanup;
+                }
+                const uint8_t * guid = &marker->data[tagExtendedXMP.size];
+                for (size_t c = 0; c < AVIF_EXTENDED_XMP_GUID_LENGTH; ++c) {
+                    // According to Adobe XMP Specification Part 3 section 1.1.3.1:
+                    //   "128-bit GUID stored as a 32-byte ASCII hex string, capital A-F, no null termination"
+                    if (((guid[c] < '0') || (guid[c] > '9')) && ((guid[c] < 'A') || (guid[c] > 'F'))) {
+                        fprintf(stderr, "XMP extraction failed: invalid XMP segment GUID\n");
+                        goto cleanup;
+                    }
+                }
+                // Size of the current extended segment.
+                const size_t extendedXMPSize = marker->data_length - AVIF_OFFSET_TILL_EXTENDED_XMP;
+                // Expected size of the sum of all extended segments.
+                // According to Adobe XMP Specification Part 3 section 1.1.3.1:
+                //   "full length of the ExtendedXMP serialization as a 32-bit unsigned integer"
+                const uint32_t totalExtendedXMPSize =
+                    avifJPEGReadUint32BigEndian(&marker->data[tagExtendedXMP.size + AVIF_EXTENDED_XMP_GUID_LENGTH]);
+                // Offset in totalXMP after standardXMP.
+                // According to Adobe XMP Specification Part 3 section 1.1.3.1:
+                //   "offset of this portion as a 32-bit unsigned integer"
+                const uint32_t extendedXMPOffset =
+                    avifJPEGReadUint32BigEndian(&marker->data[tagExtendedXMP.size + AVIF_EXTENDED_XMP_GUID_LENGTH + 4]);
+                if (((uint64_t)standardXMPSize + totalExtendedXMPSize) > SIZE_MAX) {
+                    fprintf(stderr, "XMP extraction failed: total XMP size is too large\n");
+                    goto cleanup;
+                }
+                if ((extendedXMPSize == 0) || (((uint64_t)extendedXMPOffset + extendedXMPSize) > totalExtendedXMPSize)) {
+                    fprintf(stderr, "XMP extraction failed: invalid extended XMP segment size or offset\n");
+                    goto cleanup;
+                }
+                if (foundExtendedXMP) {
+                    if (memcmp(guid, extendedXMPGUID, AVIF_EXTENDED_XMP_GUID_LENGTH)) {
+                        fprintf(stderr, "XMP extraction failed: extended XMP segment GUID mismatch\n");
+                        goto cleanup;
+                    }
+                    if (totalExtendedXMPSize != (totalXMP.size - standardXMPSize)) {
+                        fprintf(stderr, "XMP extraction failed: extended XMP total size mismatch\n");
+                        goto cleanup;
+                    }
+                } else {
+                    memcpy(extendedXMPGUID, guid, AVIF_EXTENDED_XMP_GUID_LENGTH);
+
+                    avifRWDataRealloc(&totalXMP, (size_t)standardXMPSize + totalExtendedXMPSize);
+                    memcpy(totalXMP.data, standardXMPData, standardXMPSize);
+
+                    // Keep track of the bytes that were set.
+                    avifRWDataRealloc(&extendedXMPReadBytes, totalExtendedXMPSize);
+                    memset(extendedXMPReadBytes.data, 0, extendedXMPReadBytes.size);
+
+                    foundExtendedXMP = AVIF_TRUE;
+                }
+                // According to Adobe XMP Specification Part 3 section 1.1.3.1:
+                //   "A robust JPEG reader should tolerate the marker segments in any order."
+                memcpy(&totalXMP.data[standardXMPSize + extendedXMPOffset], &marker->data[AVIF_OFFSET_TILL_EXTENDED_XMP], extendedXMPSize);
+
+                // Make sure no previously read data was overwritten by the current segment.
+                if (memchr(&extendedXMPReadBytes.data[extendedXMPOffset], 1, extendedXMPSize)) {
+                    fprintf(stderr, "XMP extraction failed: overlapping extended XMP segments\n");
+                    goto cleanup;
+                }
+                // Keep track of the bytes that were set.
+                memset(&extendedXMPReadBytes.data[extendedXMPOffset], 1, extendedXMPSize);
+            }
+        }
+
+        if (foundExtendedXMP) {
+            // Make sure there is no missing byte.
+            if (memchr(extendedXMPReadBytes.data, 0, extendedXMPReadBytes.size)) {
+                fprintf(stderr, "XMP extraction failed: missing extended XMP segments\n");
                 goto cleanup;
             }
+
+            // According to Adobe XMP Specification Part 3 section 1.1.3.1:
+            //   "A reader must incorporate only ExtendedXMP blocks whose GUID matches the value of xmpNote:HasExtendedXMP."
+            uint8_t xmpNote[AVIF_XMP_NOTE_TAG_LENGTH + AVIF_EXTENDED_XMP_GUID_LENGTH];
+            memcpy(xmpNote, AVIF_XMP_NOTE_TAG, AVIF_XMP_NOTE_TAG_LENGTH);
+            memcpy(xmpNote + AVIF_XMP_NOTE_TAG_LENGTH, extendedXMPGUID, AVIF_EXTENDED_XMP_GUID_LENGTH);
+            if (!avifJPEGFindSubstr(standardXMPData, standardXMPSize, xmpNote, sizeof(xmpNote))) {
+                // Try the alternative before returning an error.
+                uint8_t alternativeXmpNote[AVIF_ALTERNATIVE_XMP_NOTE_TAG_LENGTH + AVIF_EXTENDED_XMP_GUID_LENGTH];
+                memcpy(alternativeXmpNote, AVIF_ALTERNATIVE_XMP_NOTE_TAG, AVIF_ALTERNATIVE_XMP_NOTE_TAG_LENGTH);
+                memcpy(alternativeXmpNote + AVIF_ALTERNATIVE_XMP_NOTE_TAG_LENGTH, extendedXMPGUID, AVIF_EXTENDED_XMP_GUID_LENGTH);
+                if (!avifJPEGFindSubstr(standardXMPData, standardXMPSize, alternativeXmpNote, sizeof(alternativeXmpNote))) {
+                    fprintf(stderr, "XMP extraction failed: standard and extended XMP GUID mismatch\n");
+                    goto cleanup;
+                }
+            }
+
+            // According to Adobe XMP Specification Part 3 section 1.1.3.1:
+            //   "A JPEG reader must [...] remove the xmpNote:HasExtendedXMP property."
+            // This constraint is ignored here because leaving the xmpNote:HasExtendedXMP property is rather harmless
+            // and editing XMP metadata is quite involved.
+
+            avifRWDataFree(&avif->xmp);
+            avif->xmp = totalXMP;
+            totalXMP.data = NULL;
+            totalXMP.size = 0;
+        } else if (standardXMPData) {
+            avifImageSetMetadataXMP(avif, standardXMPData, standardXMPSize);
         }
     }
     jpeg_finish_decompress(&cinfo);
@@ -381,6 +525,8 @@
     fclose(f);
     free(iccData);
     avifRGBImageFreePixels(&rgb);
+    avifRWDataFree(&totalXMP);
+    avifRWDataFree(&extendedXMPReadBytes);
     return ret;
 }
 
diff --git a/tests/data/README.md b/tests/data/README.md
index b9d9420..ff23109 100644
--- a/tests/data/README.md
+++ b/tests/data/README.md
@@ -33,6 +33,25 @@
 
 The structure can be displayed using `exiv2 -pS <file>`.
 
+### File [dog_exif_extended_xmp_icc.jpg](dog_exif_extended_xmp_icc.jpg)
+
+![](dog_exif_extended_xmp_icc.jpg)
+
+License: [same as libavif](https://github.com/AOMediaCodec/libavif/blob/main/LICENSE)
+
+Source: Personal photo.
+
+| address | marker      | length | data                                         |
+|--------:|-------------|-------:|----------------------------------------------|
+|       0 | 0xffd8 SOI  |        |                                              |
+|       2 | 0xffe1 APP1 |    884 | `Exif..II*......................`            |
+|     888 | 0xffe1 APP1 |    353 | `http://ns.adobe.com/xap/1.0/.<x:`           |
+|    1243 | 0xffe1 APP1 |  32417 | `http://ns.adobe.com/xmp/extensio`           |
+|   33662 | 0xffe0 APP0 |     16 | `JFIF.........`                              |
+|         |             |        | ...                                          |
+|   33818 | 0xffe2 APP2 |    612 | `ICC_PROFILE......T........mntrRG chunk 1/1` |
+|         |             |        | ...                                          |
+
 ### File [paris_exif_xmp_icc.jpg](paris_exif_xmp_icc.jpg)
 
 ![](paris_exif_xmp_icc.jpg)
@@ -50,6 +69,30 @@
 |    5087 | 0xffe2 APP2 |    612 | `ICC_PROFILE......T........mntrRG chunk 1/1` |
 |         |             |        | ...                                          |
 
+### File [paris_extended_xmp.jpg](paris_extended_xmp.jpg)
+
+![](paris_extended_xmp.jpg)
+
+License: [same as libavif](https://github.com/AOMediaCodec/libavif/blob/main/LICENSE)
+
+Source: Metadata was extracted from `paris_exif_xmp_icc.jpg` with
+`exiftool -tagsfromfile paris_exif_xmp_icc.jpg paris_exif_xmp_icc.xmp`. The text of the first book of
+[De finibus bonorum et malorum](https://en.wikipedia.org/wiki/De_finibus_bonorum_et_malorum) was manually inserted in
+that file under the tag `xmp:Label` and the second book under the tag `xmp:Nickname` (any `<` or `>` removed to avoid
+conflicts with XMP). The file was reconstructed with
+`exiftool -tagsfromfile paris_exif_xmp_icc.xmp -Exif= -icc_profile= paris_exif_xmp_icc.jpg -o paris_extended_xmp.jpg`.
+The goal is to have a large XMP blob so that it can only be stored as multiple extended XMP segments.
+
+|  address | marker      | length | data                               |
+|---------:|-------------|-------:|------------------------------------|
+|        0 | 0xffd8 SOI  |        |                                    |
+|        2 | 0xffe0 APP0 |     16 | `JFIF.....,.,.`                    |
+|       20 | 0xffe1 APP1 |   5531 | `http://ns.adobe.com/xap/1.0/.<?x` |
+|     5553 | 0xffe1 APP1 |  65535 | `http://ns.adobe.com/xmp/extensio` |
+|    71090 | 0xffe1 APP1 |  65535 | `http://ns.adobe.com/xmp/extensio` |
+|   136627 | 0xffe1 APP1 |   4791 | `http://ns.adobe.com/xmp/extensio` |
+|          |             |        | ...                                |
+
 ### File [paris_icc_exif_xmp.png](paris_icc_exif_xmp.png)
 
 ![](paris_icc_exif_xmp.png)
diff --git a/tests/data/dog_exif_extended_xmp_icc.jpg b/tests/data/dog_exif_extended_xmp_icc.jpg
new file mode 100644
index 0000000..0ec3c88
--- /dev/null
+++ b/tests/data/dog_exif_extended_xmp_icc.jpg
Binary files differ
diff --git a/tests/data/paris_extended_xmp.jpg b/tests/data/paris_extended_xmp.jpg
new file mode 100644
index 0000000..8ba5086
--- /dev/null
+++ b/tests/data/paris_extended_xmp.jpg
Binary files differ
diff --git a/tests/gtest/avifmetadatatest.cc b/tests/gtest/avifmetadatatest.cc
index 7372b97..fa4f14c 100644
--- a/tests/gtest/avifmetadatatest.cc
+++ b/tests/gtest/avifmetadatatest.cc
@@ -310,6 +310,22 @@
 
 //------------------------------------------------------------------------------
 
+TEST(MetadataTest, ExtendedXMP) {
+  const testutil::AvifImagePtr image =
+      testutil::ReadImage(data_path, "dog_exif_extended_xmp_icc.jpg");
+  ASSERT_NE(image, nullptr);  // The JPEG must have been read.
+  ASSERT_NE(image->xmp.size, 0u);  // The image must carry an XMP payload.
+}
+
+TEST(MetadataTest, MultipleExtendedXMPAndAlternativeGUIDTag) {
+  const testutil::AvifImagePtr image =
+      testutil::ReadImage(data_path, "paris_extended_xmp.jpg");
+  ASSERT_NE(image, nullptr);  // The JPEG must have been read.
+  ASSERT_GT(image->xmp.size, size_t{65536 * 2});  // More than 128 KiB of XMP.
+}
+
+//------------------------------------------------------------------------------
+
 }  // namespace
 }  // namespace libavif