commit f439609304371eb3dd7ac238f39c470743261597
Author: Enrico Forestieri <for...@lyx.org>
Date:   Wed Jun 11 23:04:39 2014 +0200

    Make binary file detection more robust.
    
    The magic library can detect the charset used by a file. While this
    detection is not foolproof, the library seems to be infallible as
    regards the binary nature of a file. So, use libmagic for the detection
    and fall back to the previous method if the library is not installed or
    its database cannot be loaded.

diff --git a/src/Format.cpp b/src/Format.cpp
index 44e3f63..063b683 100644
--- a/src/Format.cpp
+++ b/src/Format.cpp
@@ -241,17 +241,13 @@ string guessFormatFromContents(FileName const & fn)
        int const max_count = 50;
        int count = 0;
 
-       // Maximum number of binary chars allowed for latex detection
-       int const max_bin = 5;
-
        string str;
        string format;
        bool firstLine = true;
        bool backslash = false;
        bool maybelatex = false;
-       int binchars = 0;
        int dollars = 0;
-       while ((count++ < max_count) && format.empty() && binchars <= max_bin) {
+       while ((count++ < max_count) && format.empty() && !maybelatex) {
                if (ifs.eof())
                        break;
 
@@ -378,17 +374,9 @@ string guessFormatFromContents(FileName const & fn)
                                // inline equation
                                maybelatex = true;
                }
-
-               // Note that this is formally not correct, since count_bin_chars
-               // expects utf8, and str can be anything: plain text in any
-               // encoding, or really binary data. In practice it works, since
-               // QString::fromUtf8() drops invalid utf8 sequences, and while
-               // the exact number may not be correct, we still get a high
-               // number for truly binary files.
-               binchars += count_bin_chars(str);
        }
 
-       if (format.empty() && binchars <= max_bin && maybelatex)
+       if (format.empty() && maybelatex && !isBinaryFile(fn))
                format = "latex";
 
        if (format.empty()) {
diff --git a/src/support/filetools.cpp b/src/support/filetools.cpp
index b9c7e7f..d167d6a 100644
--- a/src/support/filetools.cpp
+++ b/src/support/filetools.cpp
@@ -43,6 +43,9 @@
 #include "support/regex.h"
 
 #include <fcntl.h>
+#ifdef HAVE_MAGIC_H
+#include <magic.h>
+#endif
 
 #include <cerrno>
 #include <cstdlib>
@@ -91,6 +94,60 @@ bool isValidDVIFileName(string const & filename)
 }
 
 
+bool isBinaryFile(FileName const & filename)
+{
+       bool isbinary = false;
+       if (filename.empty() || !filename.exists())
+               return isbinary;
+
+#ifdef HAVE_MAGIC_H
+       magic_t magic_cookie = magic_open(MAGIC_MIME_ENCODING);
+       if (magic_cookie) {
+               bool detected = true;
+               if (magic_load(magic_cookie, NULL) != 0) {
+                       LYXERR(Debug::FILES, "isBinaryFile: "
+                               "Could not load magic database - "
+                               << magic_error(magic_cookie));
+                       detected = false;
+               } else {
+                       char const *charset = magic_file(magic_cookie,
+                                       filename.toFilesystemEncoding().c_str());
+                       isbinary = contains(charset, "binary");
+               }
+               magic_close(magic_cookie);
+               if (detected)
+                       return isbinary;
+       }
+#endif
+       // Try by looking for binary chars at the beginning of the file.
+       // Note that this is formally not correct, since count_bin_chars
+       // expects utf8, and the passed string can be anything: plain text
+       // in any encoding, or really binary data. In practice it works,
+       // since QString::fromUtf8() drops invalid utf8 sequences, and
+       // while the exact number may not be correct, we still get a high
+       // number for truly binary files.
+
+       ifstream ifs(filename.toFilesystemEncoding().c_str());
+       if (!ifs)
+               return isbinary;
+
+       // Maximum strings to read
+       int const max_count = 50;
+
+       // Maximum number of binary chars allowed
+       int const max_bin = 5;
+
+       int count = 0;
+       int binchars = 0;
+       string str;
+       while (count++ < max_count && !ifs.eof()) {
+               getline(ifs, str);
+               binchars += count_bin_chars(str);
+       }
+       return binchars > max_bin;
+}
+
+
 string const latex_path(string const & original_path,
                latex_path_extension extension,
                latex_path_dots dots)
diff --git a/src/support/filetools.h b/src/support/filetools.h
index fbc14f8..9d91f33 100644
--- a/src/support/filetools.h
+++ b/src/support/filetools.h
@@ -78,6 +78,9 @@ bool isValidLaTeXFileName(std::string const & filename);
 */
 bool isValidDVIFileName(std::string const & filename);
 
+/// check whether the file has binary contents
+bool isBinaryFile(FileName const & filename);
+
 /** Returns the path of a library data file.
     Search the file name.ext in the subdirectory dir of
       -# user_lyxdir

Reply via email to