filter/Library_xmlfd.mk                        |    1 
 filter/source/xmlfilterdetect/filterdetect.cxx |   47 +++++++++++++++++++++++--
 2 files changed, 45 insertions(+), 3 deletions(-)

New commits:
commit f4a693ef5f2f07dc2817f9a28183cf7c5a7bb7e6
Author: Mike Kaganski <[email protected]>
Date:   Tue Apr 4 22:37:45 2017 +0300

    tdf#106955: Detect XML by MediaType
    
    According to Extensible Markup Language (XML) 1.0 (see
    https://www.w3.org/TR/2008/REC-xml-20081126/#sec-prolog-dtd),
    all parts of XML prolog (including XML declaration) are optional,
    so XML stream without <?xml ... ?> is well-formed (though not
    valid).
    
    XMLFilterDetect uses only XML declaration to detect if the file is
    to be processed further. However, this creates problems with said
    documents.
    
    This commit checks if the document has MediaType set to one of
    known XML media types, in case when the check for XML declaration
    failed.
    
    Change-Id: I31627c0e3a39bee241f609650280ebac3f1cede8
    Reviewed-on: https://gerrit.libreoffice.org/36101
    Tested-by: Jenkins <[email protected]>
    Reviewed-by: Mike Kaganski <[email protected]>
    (cherry picked from commit 156f778593ca9c57845076a88c6b544a63e12e7a)
    Reviewed-on: https://gerrit.libreoffice.org/36133
    Tested-by: Mike Kaganski <[email protected]>

diff --git a/filter/Library_xmlfd.mk b/filter/Library_xmlfd.mk
index 54fdd0aa01e9..601a7678312b 100644
--- a/filter/Library_xmlfd.mk
+++ b/filter/Library_xmlfd.mk
@@ -30,6 +30,7 @@ $(eval $(call gb_Library_use_libraries,xmlfd,\
        cppuhelper \
        cppu \
        sal \
+       svl \
        utl \
        tl \
        $(gb_UWINAPI) \
diff --git a/filter/source/xmlfilterdetect/filterdetect.cxx b/filter/source/xmlfilterdetect/filterdetect.cxx
index 8ce2522dfb2c..3e13cb17d607 100644
--- a/filter/source/xmlfilterdetect/filterdetect.cxx
+++ b/filter/source/xmlfilterdetect/filterdetect.cxx
@@ -27,6 +27,7 @@
 #include <cppuhelper/supportsservice.hxx>
 #include <ucbhelper/content.hxx>
 #include <unotools/ucbstreamhelper.hxx>
+#include <svl/inettype.hxx>
 #include <memory>
 
 using namespace com::sun::star::container;
@@ -49,6 +50,25 @@ OUString supportedByType( const OUString& clipBoardFormat,  const OUString& resu
     return sTypeName;
 }
 
+bool IsMediaTypeXML( const OUString& mediaType ) // true only for "application/xml" or "application/<subtype>+xml", compared ASCII-case-insensitively
+{
+    if (!mediaType.isEmpty()) // an empty media type skips straight to the final "return false"
+    {
+        OUString sType, sSubType;
+        INetContentTypes::parse(mediaType, sType, sSubType); // result of parse() is not checked; NOTE(review): presumably sType/sSubType stay empty on failure, so the tests below reject it - confirm
+        if (sType.equalsIgnoreAsciiCase("application"))
+        {
+            // RFC 3023: application/xml; don't detect text/xml
+            if (sSubType.equalsIgnoreAsciiCase("xml"))
+                return true;
+            // Registered media types: application/XXXX+xml
+            if (sSubType.endsWithIgnoreAsciiCase("+xml"))
+                return true;
+        }
+    }
+    return false; // empty input, a type other than "application" (text/xml included), or a subtype that is neither "xml" nor "...+xml"
+}
+
 }
 
 OUString SAL_CALL FilterDetect::detect( css::uno::Sequence< css::beans::PropertyValue >& aArguments ) throw( css::uno::RuntimeException, std::exception )
@@ -125,9 +145,30 @@ OUString SAL_CALL FilterDetect::detect( css::uno::Sequence< css::beans::Property
             resultString = read_uInt16s_ToOUString( *pInStream, nSize );
 
         if ( !resultString.startsWith( "<?xml" ) )
-            // This is not an XML stream.  It makes no sense to try to detect
-            // a non-XML file type here.
-            return OUString();
+        {
+            // Check the content type; XML declaration is optional in XML files according to XML 1.0 ch.2.8
+            // (see https://www.w3.org/TR/2008/REC-xml-20081126/#sec-prolog-dtd)
+            OUString sMediaType;
+            try
+            {
+                ::ucbhelper::Content aContent(
+                    sUrl, Reference< css::ucb::XCommandEnvironment >(),
+                    mxCtx);
+                aContent.getPropertyValue("MediaType") >>= sMediaType;
+                if (sMediaType.isEmpty())
+                {
+                    aContent.getPropertyValue("Content-Type") >>= sMediaType;
+                }
+            }
+            catch (...) {}
+
+            if (!IsMediaTypeXML(sMediaType))
+            {
+                // This is not an XML stream.  It makes no sense to try to detect
+                // a non-XML file type here.
+                return OUString();
+            }
+        }
 
         // test typedetect code
        Reference <XNameAccess> xTypeCont(mxCtx->getServiceManager()->createInstanceWithContext("com.sun.star.document.TypeDetection", mxCtx), UNO_QUERY);
_______________________________________________
Libreoffice-commits mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/libreoffice-commits

Reply via email to