Fix Extended XMP parsing (#2909)

diff --git a/apps/shared/avifjpeg.c b/apps/shared/avifjpeg.c
index 575eeb9..300b3a8 100644
--- a/apps/shared/avifjpeg.c
+++ b/apps/shared/avifjpeg.c
@@ -435,6 +435,7 @@
 
 #define XML_NAME_SPACE_GAIN_MAP "http://ns.adobe.com/hdr-gain-map/1.0/"
 #define XML_NAME_SPACE_RDF "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+#define XML_NAME_SPACE_XMP_NOTE "http://ns.adobe.com/xmp/note/"
 
 // Finds an 'rdf:Description' node containing a gain map version attribute (hdrgm:Version="1.0").
 // Returns NULL if not found.
@@ -466,19 +467,12 @@
     return NULL;
 }
 
-// Use XML_PARSE_RECOVER and XML_PARSE_NOERROR to avoid failing/printing errors for invalid XML.
-// In particular, if the jpeg files contains extended XMP, avifJPEGReadInternal simply concatenates it to
-// standard XMP, which is not a valid XML tree.
-// TODO(maryla): better handle extended XMP. If the gain map metadata is in the extended part,
-// the current code won't detect it.
-#define LIBXML2_XML_PARSING_FLAGS (XML_PARSE_RECOVER | XML_PARSE_NOERROR)
-
 // Returns true if there is an 'rdf:Description' node containing a gain map version attribute (hdrgm:Version="1.0").
 // On the main image, this signals that the file also contains a gain map.
 // On a subsequent image, this signals that it is a gain map.
 static avifBool avifJPEGHasGainMapXMPNode(const uint8_t * xmpData, size_t xmpSize)
 {
-    xmlDoc * document = xmlReadMemory((const char *)xmpData, (int)xmpSize, NULL, NULL, LIBXML2_XML_PARSING_FLAGS);
+    xmlDoc * document = xmlReadMemory((const char *)xmpData, (int)xmpSize, NULL, NULL, /*options=*/0);
     if (document == NULL) {
         return AVIF_FALSE; // Probably and out of memory error.
     }
@@ -660,7 +654,7 @@
 // Returns AVIF_TRUE if the gain map metadata was successfully read.
 avifBool avifJPEGParseGainMapXMP(const uint8_t * xmpData, size_t xmpSize, avifGainMap * gainMap)
 {
-    xmlDoc * document = xmlReadMemory((const char *)xmpData, (int)xmpSize, NULL, NULL, LIBXML2_XML_PARSING_FLAGS);
+    xmlDoc * document = xmlReadMemory((const char *)xmpData, (int)xmpSize, NULL, NULL, /*options=*/0);
     if (document == NULL) {
         return AVIF_FALSE; // Probably an out of memory error.
     }
@@ -848,6 +842,117 @@
     }
     return AVIF_FALSE;
 }
+
+// Merges the standard XMP data with the extended XMP data.
+// Returns AVIF_FALSE if an error occurred.
+static avifBool avifJPEGMergeXMP(const uint8_t * standardXMPData,
+                                 uint32_t standardXMPSize,
+                                 const avifRWData extendedXMP,
+                                 avifBool foundAlternativeXMPNote,
+                                 avifRWData * xmp)
+{
+    // Initialize the XMP RDF.
+    avifBool isValid = AVIF_TRUE;
+    xmlDoc * extendedXMPDoc = NULL;
+    xmlChar * xmlBuff = NULL;
+    xmlDoc * xmpDoc = xmlReadMemory((const char *)standardXMPData, (int)standardXMPSize, "standard.xml", NULL, /*options=*/0);
+    xmlNode * xmpRdf = (xmlNode *)avifJPEGFindXMLNodeByName(xmlDocGetRootElement(xmpDoc),
+                                                            XML_NAME_SPACE_RDF,
+                                                            "RDF",
+                                                            /*recursive=*/AVIF_TRUE);
+    if (!xmpRdf) {
+        fprintf(stderr, "XMP extraction failed: cannot find RDF node\n");
+        isValid = AVIF_FALSE;
+        goto cleanup_xml;
+    }
+    // According to Adobe XMP Specification Part 3 section 1.1.3.1:
+    //   "A JPEG reader must [...] remove the xmpNote:HasExtendedXMP property."
+    avifBool foundHasExtendedXMP = AVIF_FALSE;
+    xmlNode * descNode = xmpRdf->children;
+    while (!foundHasExtendedXMP && descNode != NULL) {
+        if (descNode->type == XML_ELEMENT_NODE && descNode->ns != NULL &&
+            xmlStrcmp(descNode->ns->href, (const xmlChar *)XML_NAME_SPACE_RDF) == 0 &&
+            xmlStrcmp(descNode->name, (const xmlChar *)"Description") == 0) {
+            // Remove the HasExtendedXMP property.
+            if (foundAlternativeXMPNote) {
+                xmlNodePtr cur = descNode->children;
+                while (cur != NULL) {
+                    if (cur->type == XML_ELEMENT_NODE && cur->ns != NULL && xmlStrcmp(cur->name, (const xmlChar *)"HasExtendedXMP") == 0 &&
+                        xmlStrcmp(cur->ns->href, (const xmlChar *)XML_NAME_SPACE_XMP_NOTE) == 0) {
+                        // We must Unlink and Free the node.
+                        xmlUnlinkNode(cur);
+                        xmlFreeNode(cur);
+                        foundHasExtendedXMP = AVIF_TRUE;
+                        break;
+                    }
+                    cur = cur->next;
+                }
+            } else {
+                xmlAttrPtr attr = xmlHasNsProp(descNode, (const xmlChar *)"HasExtendedXMP", (const xmlChar *)XML_NAME_SPACE_XMP_NOTE);
+
+                if (attr) {
+                    xmlRemoveProp(attr);
+                    foundHasExtendedXMP = AVIF_TRUE;
+                    break;
+                }
+            }
+        }
+        // Check next sibling in case there are multiple Descriptions.
+        descNode = descNode->next;
+    }
+    if (!foundHasExtendedXMP) {
+        fprintf(stderr, "XMP extraction failed: cannot find HasExtendedXMP property\n");
+        isValid = AVIF_FALSE;
+        goto cleanup_xml;
+    }
+
+    // Read the extended XMP.
+    extendedXMPDoc = xmlReadMemory((const char *)extendedXMP.data,
+                                   (int)extendedXMP.size,
+                                   "extended.xml",
+                                   NULL,
+                                   /*options=*/0);
+    const xmlNode * extendedXMPRdf = avifJPEGFindXMLNodeByName(xmlDocGetRootElement(extendedXMPDoc),
+                                                               XML_NAME_SPACE_RDF,
+                                                               "RDF",
+                                                               /*recursive=*/AVIF_TRUE);
+    if (!extendedXMPRdf) {
+        fprintf(stderr, "XMP extraction failed: invalid standard XMP segment\n");
+        isValid = AVIF_FALSE;
+        goto cleanup_xml;
+    }
+    // Copy the extended nodes over.
+    xmlNode * cur = extendedXMPRdf->xmlChildrenNode;
+    while (cur != NULL) {
+        // Copy the child.
+        xmlNode * childCopy = xmlDocCopyNode(cur, xmpDoc, 1);
+        xmlAddChild(xmpRdf, childCopy);
+        cur = cur->next;
+    }
+
+    // Dump the new XMP to avif->xmp.
+    int buffer_size;
+    xmlDocDumpFormatMemory(xmpDoc, &xmlBuff, &buffer_size, 1);
+    if (xmlBuff == NULL) {
+        fprintf(stderr, "Error: Could not dump XML to memory.\n");
+        isValid = AVIF_FALSE;
+        goto cleanup_xml;
+    }
+
+    avifRWDataFree(xmp);
+    if (avifRWDataRealloc(xmp, (size_t)buffer_size) != AVIF_RESULT_OK) {
+        fprintf(stderr, "XMP copy failed: out of memory\n");
+        isValid = AVIF_FALSE;
+        goto cleanup_xml;
+    }
+    memcpy(xmp->data, xmlBuff, buffer_size);
+cleanup_xml:
+    xmlFreeDoc(xmpDoc);
+    xmlFreeDoc(extendedXMPDoc);
+    xmlFree(xmlBuff);
+    return isValid;
+}
+
 #endif // AVIF_ENABLE_JPEG_GAIN_MAP_CONVERSION
 
 // Note on setjmp() and volatile variables:
@@ -882,8 +987,8 @@
     avifRGBImage rgb;
     memset(&rgb, 0, sizeof(avifRGBImage));
 
-    // Standard XMP segment followed by all extended XMP segments.
-    avifRWData totalXMP = { NULL, 0 };
+    // Extended XMP after concatenation of all extended XMP segments.
+    avifRWData extendedXMP = { NULL, 0 };
     // Each byte set to 0 is a missing byte. Each byte set to 1 was read and copied to totalXMP.
     avifRWData extendedXMPReadBytes = { NULL, 0 };
 
@@ -1115,21 +1220,16 @@
                         fprintf(stderr, "XMP extraction failed: extended XMP segment GUID mismatch\n");
                         goto cleanup;
                     }
-                    if (totalExtendedXMPSize != (totalXMP.size - standardXMPSize)) {
+                    if (totalExtendedXMPSize != extendedXMP.size) {
                         fprintf(stderr, "XMP extraction failed: extended XMP total size mismatch\n");
                         goto cleanup;
                     }
                 } else {
                     memcpy(extendedXMPGUID, guid, AVIF_JPEG_EXTENDED_XMP_GUID_LENGTH);
 
-                    if (avifRWDataRealloc(&totalXMP, (size_t)standardXMPSize + totalExtendedXMPSize) != AVIF_RESULT_OK) {
-                        fprintf(stderr, "XMP extraction failed: out of memory\n");
-                        goto cleanup;
-                    }
-                    memcpy(totalXMP.data, standardXMPData, standardXMPSize);
-
-                    // Keep track of the bytes that were set.
-                    if (avifRWDataRealloc(&extendedXMPReadBytes, totalExtendedXMPSize) != AVIF_RESULT_OK) {
+                    // Allocate the extended XMP and keep track of the bytes that were set.
+                    if (avifRWDataRealloc(&extendedXMP, (size_t)totalExtendedXMPSize) != AVIF_RESULT_OK ||
+                        avifRWDataRealloc(&extendedXMPReadBytes, totalExtendedXMPSize) != AVIF_RESULT_OK) {
                         fprintf(stderr, "XMP extraction failed: out of memory\n");
                         goto cleanup;
                     }
@@ -1139,7 +1239,7 @@
                 }
                 // According to Adobe XMP Specification Part 3 section 1.1.3.1:
                 //   "A robust JPEG reader should tolerate the marker segments in any order."
-                memcpy(&totalXMP.data[standardXMPSize + extendedXMPOffset], &marker->data[AVIF_JPEG_OFFSET_TILL_EXTENDED_XMP], extendedXMPSize);
+                memcpy(&extendedXMP.data[extendedXMPOffset], &marker->data[AVIF_JPEG_OFFSET_TILL_EXTENDED_XMP], extendedXMPSize);
 
                 // Make sure no previously read data was overwritten by the current segment.
                 if (memchr(&extendedXMPReadBytes.data[extendedXMPOffset], 1, extendedXMPSize)) {
@@ -1163,7 +1263,10 @@
             uint8_t xmpNote[AVIF_JPEG_XMP_NOTE_TAG_LENGTH + AVIF_JPEG_EXTENDED_XMP_GUID_LENGTH];
             memcpy(xmpNote, AVIF_JPEG_XMP_NOTE_TAG, AVIF_JPEG_XMP_NOTE_TAG_LENGTH);
             memcpy(xmpNote + AVIF_JPEG_XMP_NOTE_TAG_LENGTH, extendedXMPGUID, AVIF_JPEG_EXTENDED_XMP_GUID_LENGTH);
-            if (!avifJPEGFindSubstr(standardXMPData, standardXMPSize, xmpNote, sizeof(xmpNote))) {
+            avifBool foundAlternativeXMPNote;
+            if (avifJPEGFindSubstr(standardXMPData, standardXMPSize, xmpNote, sizeof(xmpNote))) {
+                foundAlternativeXMPNote = AVIF_FALSE;
+            } else {
                 // Try the alternative before returning an error.
                 uint8_t alternativeXmpNote[AVIF_JPEG_ALTERNATIVE_XMP_NOTE_TAG_LENGTH + AVIF_JPEG_EXTENDED_XMP_GUID_LENGTH];
                 memcpy(alternativeXmpNote, AVIF_JPEG_ALTERNATIVE_XMP_NOTE_TAG, AVIF_JPEG_ALTERNATIVE_XMP_NOTE_TAG_LENGTH);
@@ -1172,17 +1275,24 @@
                     fprintf(stderr, "XMP extraction failed: standard and extended XMP GUID mismatch\n");
                     goto cleanup;
                 }
+                foundAlternativeXMPNote = AVIF_TRUE;
             }
+            (void)foundAlternativeXMPNote;
 
-            // According to Adobe XMP Specification Part 3 section 1.1.3.1:
-            //   "A JPEG reader must [...] remove the xmpNote:HasExtendedXMP property."
-            // This constraint is ignored here because leaving the xmpNote:HasExtendedXMP property is rather harmless
-            // and editing XMP metadata is quite involved.
-
+#if defined(AVIF_ENABLE_JPEG_GAIN_MAP_CONVERSION)
+            if (!avifJPEGMergeXMP(standardXMPData, standardXMPSize, extendedXMP, foundAlternativeXMPNote, &avif->xmp)) {
+                goto cleanup;
+            }
+#else
+            fprintf(stderr, "WARNING: must be compiled with libxml2 to copy extended XMP properly\n");
             avifRWDataFree(&avif->xmp);
-            avif->xmp = totalXMP;
-            totalXMP.data = NULL;
-            totalXMP.size = 0;
+            if (avifRWDataRealloc(&avif->xmp, (size_t)standardXMPSize + extendedXMP.size) != AVIF_RESULT_OK) {
+                fprintf(stderr, "XMP copy failed: out of memory\n");
+                goto cleanup;
+            }
+            memcpy(avif->xmp.data, standardXMPData, standardXMPSize);
+            memcpy(avif->xmp.data + standardXMPSize, extendedXMP.data, extendedXMP.size);
+#endif // AVIF_ENABLE_JPEG_GAIN_MAP_CONVERSION
         } else if (standardXMPData) {
             if (avifImageSetMetadataXMP(avif, standardXMPData, standardXMPSize) != AVIF_RESULT_OK) {
                 fprintf(stderr, "XMP extraction failed: out of memory\n");
@@ -1236,7 +1346,7 @@
     jpeg_destroy_decompress(&cinfo);
     free(iccData);
     avifRGBImageFreePixels(&rgb);
-    avifRWDataFree(&totalXMP);
+    avifRWDataFree(&extendedXMP);
     avifRWDataFree(&extendedXMPReadBytes);
     return ret;
 }
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index dab2c74..96010ee 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -266,6 +266,8 @@
 
     if(AVIF_ENABLE_JPEG_GAIN_MAP_CONVERSION)
         add_cmd_test(test_cmd_avifgainmaputil ${CMAKE_CURRENT_SOURCE_DIR}/data)
+    else()
+        set_tests_properties(test_cmd_stdin PROPERTIES DISABLED True)
     endif()
 
     if(AVIF_ENABLE_GOLDEN_TESTS AND AVIF_CODEC_AOM_ENCODE)
diff --git a/tests/gtest/avifjpeggainmaptest.cc b/tests/gtest/avifjpeggainmaptest.cc
index 5f729f9..a86e0f5 100644
--- a/tests/gtest/avifjpeggainmaptest.cc
+++ b/tests/gtest/avifjpeggainmaptest.cc
@@ -201,6 +201,10 @@
 <?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>
 <x:xmpmeta xmlns:x="adobe:ns:meta/">
   <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
+    <rdf:Description rdf:about="stuff" xmlns:xmpNote="http://ns.adobe.com/xmp/note/"
+        xmpNote:HasExtendedXMP="F280A6636D6DC3D306B925078C2924D3">
+      <stuff></stuff>
+    </rdf:Description>
     <rdf:Description rdf:about="stuff"
       xmlns:hdrgm="http://ns.adobe.com/hdr-gain-map/1.0/" hdrgm:Version="1.0"
       hdrgm:BaseRenditionIsHDR="False"
@@ -209,16 +213,6 @@
     </rdf:Description>
   </rdf:RDF>
 </x:xmpmeta>
-<?xpacket end="w"?>
-
-<x:xmpmeta xmlns:x="adobe:ns:meta/">
-  <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
-    <!-- Imagine this is some extended xmp that avifenc concatenated to
-      the main XMP. As a result we have invalid XMP but should still be
-      able to parse it. -->
-    <stuff></stuff>
-  </rdf:RDF>
-</x:xmpmeta>
   )";
   GainMapPtr gainMap(avifGainMapCreate());
   ASSERT_TRUE(avifJPEGParseGainMapXMP((const uint8_t*)xmp.data(), xmp.size(),