commit f439609304371eb3dd7ac238f39c470743261597
Author: Enrico Forestieri <[email protected]>
Date: Wed Jun 11 23:04:39 2014 +0200
Make binary file detection more robust.
The magic library can detect the charset used by a file. While this
detection is not foolproof, the library seems to be infallible as
regards the binary nature of a file. So, use libmagic for the detection
and fall back to the previous method if the library is not installed or
its database cannot be loaded.
diff --git a/src/Format.cpp b/src/Format.cpp
index 44e3f63..063b683 100644
--- a/src/Format.cpp
+++ b/src/Format.cpp
@@ -241,17 +241,13 @@ string guessFormatFromContents(FileName const & fn)
int const max_count = 50;
int count = 0;
- // Maximum number of binary chars allowed for latex detection
- int const max_bin = 5;
-
string str;
string format;
bool firstLine = true;
bool backslash = false;
bool maybelatex = false;
- int binchars = 0;
int dollars = 0;
- while ((count++ < max_count) && format.empty() && binchars <= max_bin) {
+ while ((count++ < max_count) && format.empty() && !maybelatex) {
if (ifs.eof())
break;
@@ -378,17 +374,9 @@ string guessFormatFromContents(FileName const & fn)
// inline equation
maybelatex = true;
}
-
- // Note that this is formally not correct, since count_bin_chars
- // expects utf8, and str can be anything: plain text in any
- // encoding, or really binary data. In practice it works, since
- // QString::fromUtf8() drops invalid utf8 sequences, and while
- // the exact number may not be correct, we still get a high
- // number for truly binary files.
- binchars += count_bin_chars(str);
}
- if (format.empty() && binchars <= max_bin && maybelatex)
+ if (format.empty() && maybelatex && !isBinaryFile(fn))
format = "latex";
if (format.empty()) {
diff --git a/src/support/filetools.cpp b/src/support/filetools.cpp
index b9c7e7f..d167d6a 100644
--- a/src/support/filetools.cpp
+++ b/src/support/filetools.cpp
@@ -43,6 +43,9 @@
#include "support/regex.h"
#include <fcntl.h>
+#ifdef HAVE_MAGIC_H
+#include <magic.h>
+#endif
#include <cerrno>
#include <cstdlib>
@@ -91,6 +94,60 @@ bool isValidDVIFileName(string const & filename)
}
+/// Check whether \p filename has binary (non-text) contents.
+/// Returns false if the name is empty or the file does not exist.
+/// When libmagic is available (HAVE_MAGIC_H) and both its database and
+/// the file can be examined, the answer is whether the encoding reported
+/// by libmagic contains "binary". Otherwise a heuristic is used: the
+/// file is considered binary if more than max_bin binary chars are
+/// found within its first max_count lines.
+bool isBinaryFile(FileName const & filename)
+{
+	bool isbinary = false;
+	if (filename.empty() || !filename.exists())
+		return isbinary;
+
+#ifdef HAVE_MAGIC_H
+	magic_t magic_cookie = magic_open(MAGIC_MIME_ENCODING);
+	if (magic_cookie) {
+		// Stays true only if libmagic actually produced an answer.
+		bool detected = true;
+		if (magic_load(magic_cookie, NULL) != 0) {
+			LYXERR(Debug::FILES, "isBinaryFile: "
+				"Could not load magic database - "
+				<< magic_error(magic_cookie));
+			detected = false;
+		} else {
+			char const * charset = magic_file(magic_cookie,
+				filename.toFilesystemEncoding().c_str());
+			if (charset) {
+				isbinary = contains(charset, "binary");
+			} else {
+				// magic_file() returns NULL on error; do not pass
+				// that to contains(), use the heuristic below.
+				LYXERR(Debug::FILES, "isBinaryFile: "
+					"Could not examine file - "
+					<< magic_error(magic_cookie));
+				detected = false;
+			}
+		}
+		// The cookie owns the string returned by magic_file(), so it
+		// is closed only after charset has been inspected.
+		magic_close(magic_cookie);
+		if (detected)
+			return isbinary;
+	}
+#endif
+	// Try by looking for binary chars at the beginning of the file.
+	// Note that this is formally not correct, since count_bin_chars
+	// expects utf8, and the passed string can be anything: plain text
+	// in any encoding, or really binary data. In practice it works,
+	// since QString::fromUtf8() drops invalid utf8 sequences, and
+	// while the exact number may not be correct, we still get a high
+	// number for truly binary files.
+
+	ifstream ifs(filename.toFilesystemEncoding().c_str());
+	if (!ifs)
+		return isbinary;
+
+	// Maximum strings to read
+	int const max_count = 50;
+
+	// Maximum number of binary chars allowed
+	int const max_bin = 5;
+
+	int count = 0;
+	int binchars = 0;
+	string str;
+	// Stop at the line limit, as soon as the threshold is exceeded, or
+	// when no further line can be read (end of file or read error).
+	while (count++ < max_count && binchars <= max_bin
+	       && getline(ifs, str))
+		binchars += count_bin_chars(str);
+	return binchars > max_bin;
+}
+
+
+
+
string const latex_path(string const & original_path,
latex_path_extension extension,
latex_path_dots dots)
diff --git a/src/support/filetools.h b/src/support/filetools.h
index fbc14f8..9d91f33 100644
--- a/src/support/filetools.h
+++ b/src/support/filetools.h
@@ -78,6 +78,9 @@ bool isValidLaTeXFileName(std::string const & filename);
*/
bool isValidDVIFileName(std::string const & filename);
+/// Check whether the file has binary contents; returns false if the file does not exist.
+bool isBinaryFile(FileName const & filename);
+
/** Returns the path of a library data file.
Search the file name.ext in the subdirectory dir of
-# user_lyxdir