VitaliyFilippov has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/70830


Change subject: (bug 35607) Fix OXML type detection
......................................................................

(bug 35607) Fix OXML type detection

Unzip [Content_Types].xml and match its content instead of just searching
the first 1024 bytes of a file for [Content_Types].xml and then detecting
individual types based on the extension.

This is needed because Open/LibreOffice saves [Content_Types].xml at the
end of an OXML file. MSWord opens such files correctly, but without this patch
MW detects them as application/zip.

Side changes:
* Fix reading $tail - it couldn't be read if the file size was < 65558 bytes.
* Fall back to external MIME type detection after an unsuccessful ZIP type
  detection instead of just returning 'application/zip' (maybe fileinfo
  knows something we don't :))

Change-Id: I3bf1d5e9d2a5521739f4f73c45ef67090931b420
---
M includes/MimeMagic.php
1 file changed, 50 insertions(+), 6 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/core 
refs/changes/30/70830/1

diff --git a/includes/MimeMagic.php b/includes/MimeMagic.php
index 44fafcaf..696a80b 100644
--- a/includes/MimeMagic.php
+++ b/includes/MimeMagic.php
@@ -591,7 +591,8 @@
                        return 'unknown/unknown';
                }
                $head = fread( $f, 1024 );
-               fseek( $f, -65558, SEEK_END );
+               fseek( $f, 0, SEEK_END );
+               fseek( $f, max( 0, ftell( $f ) - 65558 ), SEEK_SET );
                $tail = fread( $f, 65558 ); // 65558 = maximum size of a zip 
EOCDR
                fclose( $f );
 
@@ -726,7 +727,7 @@
                // Check for ZIP variants (before getimagesize)
                if ( strpos( $tail, "PK\x05\x06" ) !== false ) {
                        wfDebug( __METHOD__ . ": ZIP header present in $file\n" 
);
-                       return $this->detectZipType( $head, $tail, $ext );
+                       return $this->detectZipType( $file, $head, $tail, $ext 
);
                }
 
                wfSuppressWarnings();
@@ -754,6 +755,7 @@
         * header data.  Currently works for OpenDocument and OpenXML types...
         * If can't tell, returns 'application/zip'.
         *
+        * @param $file   String: filename
         * @param string $header some reasonably-sized chunk of file header
         * @param $tail   String: the tail of the file
         * @param $ext Mixed: the file extension, or true to extract it from 
the filename.
@@ -762,13 +764,14 @@
         *
         * @return string
         */
-       function detectZipType( $header, $tail = null, $ext = false ) {
+       function detectZipType( $file, $header, $tail = null, $ext = false ) {
                if ( $ext ) { # TODO: remove $ext param
                        wfDebug( __METHOD__ . ": WARNING: use of the \$ext 
parameter is deprecated. " .
                                "Use improveTypeFromExtension(\$mime, \$ext) 
instead.\n" );
                }
 
-               $mime = 'application/zip';
+               // Fall back to external type detection by default
+               $mime = false;
                $opendocTypes = array(
                        'chart-template',
                        'chart',
@@ -793,9 +796,47 @@
 
                $openxmlRegex = "/^\[Content_Types\].xml/";
 
+               $openxmlTypeRegex = 
'/ContentType=[\"\']?(application\/vnd\.(?:'.
+                       
'openxmlformats-officedocument\.wordprocessingml\.document|'.
+                       
'openxmlformats-officedocument\.wordprocessingml\.template|'.
+                       'ms-word\.document\.macroEnabled\.12|'.
+                       'ms-word\.template\.macroEnabled\.12|'.
+                       
'openxmlformats-officedocument\.presentationml\.template|'.
+                       
'openxmlformats-officedocument\.presentationml\.slideshow|'.
+                       
'openxmlformats-officedocument\.presentationml\.presentation|'.
+                       'ms-powerpoint\.addin\.macroEnabled\.12|'.
+                       'ms-powerpoint\.presentation\.macroEnabled\.12|'.
+                       'ms-powerpoint\.presentation\.macroEnabled\.12|'.
+                       'ms-powerpoint\.slideshow\.macroEnabled\.12|'.
+                       'openxmlformats-officedocument\.spreadsheetml\.sheet|'.
+                       
'openxmlformats-officedocument\.spreadsheetml\.template|'.
+                       'ms-excel\.sheet\.macroEnabled\.12|'.
+                       'ms-excel\.template\.macroEnabled\.12|'.
+                       'ms-excel\.addin\.macroEnabled\.12|'.
+                       'ms-excel\.sheet\.binary\.macroEnabled\.12|'.
+                       'ms-xpsdocument))/';
+
                if ( preg_match( $opendocRegex, substr( $header, 30 ), $matches 
) ) {
+                       // 'mimetype' entry is ALWAYS stored in the beginning 
of an ODF file
                        $mime = $matches[1];
                        wfDebug( __METHOD__ . ": detected $mime from ZIP 
archive\n" );
+               } elseif ( function_exists( 'zip_open' ) && is_resource( $zip = 
zip_open( $file ) ) ) {
+                       // MSOffice stores [Content_Types].xml in the beginning 
of OXML files,
+                       // but Open/LibreOffice stores it in the end, so we 
won't find it in $header!
+                       while ( ( $entry = zip_read( $zip ) ) ) {
+                               $fn = strtolower( zip_entry_name( $entry ) );
+                               if ( $fn == '[content_types].xml' &&
+                                               zip_entry_open( $zip, $entry, 
'r' ) ) {
+                                       $n = zip_entry_filesize( $entry );
+                                       $types = zip_entry_read( $entry, $n > 
0x10000 ? 0x10000 : $n );
+                                       zip_entry_close( $entry );
+                                       if ( preg_match( $openxmlTypeRegex, 
$types, $m ) ) {
+                                               $mime = $m[1];
+                                       }
+                                       break;
+                               }
+                       }
+                       zip_close( $zip );
                } elseif ( preg_match( $openxmlRegex, substr( $header, 30 ) ) ) 
{
                        $mime = "application/x-opc+zip";
                        # TODO: remove the block below, as soon as 
improveTypeFromExtension is used everywhere
@@ -814,7 +855,9 @@
                                }
                        }
                        wfDebug( __METHOD__ . ": detected an Open Packaging 
Conventions archive: $mime\n" );
-               } elseif ( substr( $header, 0, 8 ) == 
"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" &&
+               }
+               // OPC trailer may contain only some 'theme', but no real 
document content
+               if ( !$mime && substr( $header, 0, 8 ) == 
"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" &&
                                ( $headerpos = strpos( $tail, "PK\x03\x04" ) ) 
!== false &&
                                preg_match( $openxmlRegex, substr( $tail, 
$headerpos + 30 ) ) ) {
                        if ( substr( $header, 512, 4 ) == "\xEC\xA5\xC1\x00" ) {
@@ -843,7 +886,8 @@
                        }
 
                        wfDebug( __METHOD__ . ": detected a MS Office document 
with OPC trailer\n" );
-               } else {
+               }
+               if ( !$mime ) {
                        wfDebug( __METHOD__ . ": unable to identify type of ZIP 
archive\n" );
                }
                return $mime;

-- 
To view, visit https://gerrit.wikimedia.org/r/70830
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I3bf1d5e9d2a5521739f4f73c45ef67090931b420
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/core
Gerrit-Branch: master
Gerrit-Owner: VitaliyFilippov <vita...@yourcmc.ru>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to