File sniffing patch

Kevin Vajk Mon, 7 Feb 2000 15:05:52 -0600 (CST)


This patch (also sent to the patches mail address) adds file type
detection based on the file contents.  Precedence is given to the
filename suffix, so this code only gets used if the suffix is
unknown or unrecognized.  In the future, I'd like to have a smarter
algorithm, which looks at both the suffix and the file contents,
and tries to find the best match.

UTF-8 is not yet recognized from file contents, and there are some
TODOs floating around.  Still, it may be better than nothing.

- Kevin Vajk
  <[EMAIL PROTECTED]>

diff -u -d -r -P abi/src/wp/impexp/xp.orig/ie_imp.cpp abi/src/wp/impexp/xp/ie_imp.cpp
--- abi/src/wp/impexp/xp.orig/ie_imp.cpp        Wed Jan 26 16:22:45 2000
+++ abi/src/wp/impexp/xp/ie_imp.cpp     Mon Feb  7 11:13:03 2000
@@ -38,6 +38,8 @@
 
 struct _imp
 {
+       UT_Bool                 (*fpRecognizeContents)(const char * szBuf,
+                                                       int iNumbytes);
        UT_Bool                 (*fpRecognizeSuffix)(const char * szSuffix);
        UT_Error                (*fpStaticConstructor)(PD_Document * pDocument,
                                                                                   
IE_Imp ** ppie);
@@ -47,16 +49,16 @@
        UT_Bool                 (*fpSupportsFileType)(IEFileType ft);
 };
 
-#define DeclareImporter(n)     { n::RecognizeSuffix, n::StaticConstructor, 
n::GetDlgLabels, n::SupportsFileType }
+#define DeclareImporter(n)     { n::RecognizeContents, n::RecognizeSuffix, 
+n::StaticConstructor, n::GetDlgLabels, n::SupportsFileType }
 
 static struct _imp s_impTable[] =
 {
        DeclareImporter(IE_Imp_AbiWord_1),
       DeclareImporter(IE_Imp_GZipAbiWord),
-       DeclareImporter(IE_Imp_Text),
        DeclareImporter(IE_Imp_RTF),
        DeclareImporter(IE_Imp_MsWord_97),
        DeclareImporter(IE_Imp_UTF8),
+       DeclareImporter(IE_Imp_Text),
 };
 
                
@@ -75,10 +77,38 @@
 /*****************************************************************/
 /*****************************************************************/
 
+IEFileType IE_Imp::fileTypeForContents(const char * szBuf, int iNumbytes)
+{
+       // we have to construct the loop this way because a
+       // given filter could support more than one file type,
+       // so we must query a match for all file types
+       for (UT_uint32 k=0; (k < NrElements(s_impTable)); k++)
+       {
+               struct _imp * s = &s_impTable[k];
+               if (s->fpRecognizeContents(szBuf, iNumbytes))
+               {
+                       for (UT_uint32 a = 0; a < (int) IEFT_LAST_BOGUS; a++)
+                       {
+                               if (s->fpSupportsFileType((IEFileType) a))
+                                       return (IEFileType) a;
+                       }
+
+                       UT_ASSERT(UT_SHOULD_NOT_HAPPEN);
+                       // Hm... an importer recognizes the given data
+                       // but refuses to support any file type we request.
+                       return IEFT_Unknown;
+               }
+       }
+
+       // No filter recognizes this data
+       return IEFT_Unknown;
+       
+}
+
 IEFileType IE_Imp::fileTypeForSuffix(const char * szSuffix)
 {
        if (!szSuffix)
-               return IEFT_Text;
+               return IEFT_Unknown;
        
        // we have to construct the loop this way because a
        // given filter could support more than one file type,
@@ -96,14 +126,13 @@
 
                        UT_ASSERT(UT_SHOULD_NOT_HAPPEN);
                        // Hm... an importer has registered for the given suffix,
-                       // bug refuses to support any file type we request.
-                       // Default to Text.
-                       return IEFT_Text;
+                       // but refuses to support any file type we request.
+                       return IEFT_Unknown;
                }
        }
 
-       // No filter is registered for that extension, try Text for import
-       return IEFT_Text;
+       // No filter is registered for that extension
+       return IEFT_Unknown;
        
 }
 
@@ -120,12 +149,30 @@
        UT_ASSERT(szFilename && *szFilename);
        UT_ASSERT(ppie);
 
-       // no filter will support IEFT_Unknown, so we detect from the
-       // suffix of the filename, the real importer to use and assign
-       // that back to ieft.
+       // no filter will support IEFT_Unknown, so we try to detect
+       // from the contents of the file or the filename suffix
+       // the importer to use and assign that back to ieft.
+       // Give precedence to the file suffix.
        if (ieft == IEFT_Unknown)
        {
                ieft = IE_Imp::fileTypeForSuffix(UT_pathSuffix(szFilename));
+       }
+       if (ieft == IEFT_Unknown)
+       {
+               char szBuf[4096];  // 4096 ought to be enough
+               int iNumbytes;
+               FILE *f;
+               if ( ( f = fopen( szFilename, "r" ) ) != (FILE *)0 )
+               {
+                       iNumbytes = fread(szBuf, 1, sizeof(szBuf), f);
+                       fclose(f);
+                       ieft = IE_Imp::fileTypeForContents(szBuf, iNumbytes);
+               }
+       }
+       // as a last resort, just try importing it as text  :(
+       if (ieft == IEFT_Unknown)
+       {
+               ieft = IEFT_Text ;
        }
 
        UT_ASSERT(ieft != IEFT_Unknown);
diff -u -d -r -P abi/src/wp/impexp/xp.orig/ie_imp.h abi/src/wp/impexp/xp/ie_imp.h
--- abi/src/wp/impexp/xp.orig/ie_imp.h  Wed Jan 26 16:22:45 2000
+++ abi/src/wp/impexp/xp/ie_imp.h       Mon Feb  7 10:33:52 2000
@@ -38,6 +38,9 @@
        // responsible for destroying the importer when finished
        // with it.
 
+       static IEFileType       fileTypeForContents(const char * szBuf,
+                                       int iNumbytes);
+
        static IEFileType       fileTypeForSuffix(const char * szSuffix);
        
        static UT_Error         constructImporter(PD_Document * pDocument,
diff -u -d -r -P abi/src/wp/impexp/xp.orig/ie_imp_AbiWord_1.cpp 
abi/src/wp/impexp/xp/ie_imp_AbiWord_1.cpp
--- abi/src/wp/impexp/xp.orig/ie_imp_AbiWord_1.cpp      Wed Jan 26 16:22:45 2000
+++ abi/src/wp/impexp/xp/ie_imp_AbiWord_1.cpp   Mon Feb  7 11:15:21 2000
@@ -20,6 +20,7 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 #include "ut_types.h"
 #include "ut_assert.h"
 #include "ut_debugmsg.h"
@@ -144,6 +145,40 @@
 
 /*****************************************************************/
 /*****************************************************************/
+
+UT_Bool IE_Imp_AbiWord_1::RecognizeContents(const char * szBuf, int iNumbytes)
+{
+       int iLinesToRead = 6 ;  // Only examine the first few lines of the file
+       int iBytesScanned = 0 ;
+       const char *p ;
+       char *magic ;
+       p = szBuf ;
+       while( iLinesToRead-- )
+       {
+               magic = "<abiword " ;
+               if ( (iNumbytes - iBytesScanned) < strlen(magic) ) return(UT_FALSE);
+               if ( strncmp(p, magic, strlen(magic)) == 0 ) return(UT_TRUE);
+               magic = "<!-- This file is an AbiWord document." ;
+               if ( (iNumbytes - iBytesScanned) < strlen(magic) ) return(UT_FALSE);
+               if ( strncmp(p, magic, strlen(magic)) == 0 ) return(UT_TRUE);
+               /*  Seek to the next newline:  */
+               while ( *p != '\n' && *p != '\r' )
+               {
+                       iBytesScanned++ ; p++ ;
+                       if( iBytesScanned+2 >= iNumbytes ) return(UT_FALSE);
+               }
+               /*  Seek past the next newline:  */
+               if ( *p == '\n' || *p == '\r' )
+               {
+                       iBytesScanned++ ; p++ ;
+                       if ( *p == '\n' || *p == '\r' )
+                       {
+                               iBytesScanned++ ; p++ ;
+                       }
+               }
+       }
+       return(UT_FALSE);
+}
 
 UT_Bool IE_Imp_AbiWord_1::RecognizeSuffix(const char * szSuffix)
 {
diff -u -d -r -P abi/src/wp/impexp/xp.orig/ie_imp_AbiWord_1.h 
abi/src/wp/impexp/xp/ie_imp_AbiWord_1.h
--- abi/src/wp/impexp/xp.orig/ie_imp_AbiWord_1.h        Wed Jan 26 16:22:45 2000
+++ abi/src/wp/impexp/xp/ie_imp_AbiWord_1.h     Mon Feb  7 10:33:52 2000
@@ -48,6 +48,7 @@
     void                               _endElement(const XML_Char *name);
     void                               _charData(const XML_Char*, int);
 
+    static UT_Bool             RecognizeContents(const char * szBuf, int iNumbytes);
     static UT_Bool             RecognizeSuffix(const char * szSuffix);
     static UT_Error            StaticConstructor(PD_Document * pDocument,
            IE_Imp ** ppie);
diff -u -d -r -P abi/src/wp/impexp/xp.orig/ie_imp_GZipAbiWord.cpp 
abi/src/wp/impexp/xp/ie_imp_GZipAbiWord.cpp
--- abi/src/wp/impexp/xp.orig/ie_imp_GZipAbiWord.cpp    Wed Jan 26 16:22:45 2000
+++ abi/src/wp/impexp/xp/ie_imp_GZipAbiWord.cpp Mon Feb  7 10:43:03 2000
@@ -54,6 +54,23 @@
 /*****************************************************************/
 /*****************************************************************/
 
+UT_Bool IE_Imp_GZipAbiWord::RecognizeContents(const char * szBuf, int iNumbytes)
+{
+       // TODO: This is a hack.  Since we're just passed in
+       // TODO: some data, and not the actual filename, there isn't
+       // TODO: much we can do other than verify that it is gzip'ed
+       // TODO: data.  For the time being, assume that if it is
+       // TODO: gzip'ed, it's gzip'ed abiword.  This assumption will
+       // TODO: be false if and when we support any other compressed
+       // TODO: formats.
+       if ( iNumbytes < 2 ) return(UT_FALSE);
+       if ( ( szBuf[0] == (char)0x1f ) && ( szBuf[1] == (char)0x8b ) )
+       {
+               return(UT_TRUE);
+       }
+       return(UT_FALSE);
+}
+
 UT_Bool IE_Imp_GZipAbiWord::RecognizeSuffix(const char * szSuffix)
 {
     return (UT_stricmp(szSuffix,".zabw") == 0);
diff -u -d -r -P abi/src/wp/impexp/xp.orig/ie_imp_GZipAbiWord.h 
abi/src/wp/impexp/xp/ie_imp_GZipAbiWord.h
--- abi/src/wp/impexp/xp.orig/ie_imp_GZipAbiWord.h      Wed Jan 26 16:22:45 2000
+++ abi/src/wp/impexp/xp/ie_imp_GZipAbiWord.h   Mon Feb  7 10:33:52 2000
@@ -36,6 +36,7 @@
        virtual void            pasteFromBuffer(PD_DocumentRange * pDocRange,
                                                                                
unsigned char * pData, UT_uint32 lenData);
 
+       static UT_Bool          RecognizeContents(const char * szBuf, int iNumbytes);
        static UT_Bool          RecognizeSuffix(const char * szSuffix);
        static UT_Error         StaticConstructor(PD_Document * pDocument,
                                                                                  
IE_Imp ** ppie);
diff -u -d -r -P abi/src/wp/impexp/xp.orig/ie_imp_MsWord_97.cpp 
abi/src/wp/impexp/xp/ie_imp_MsWord_97.cpp
--- abi/src/wp/impexp/xp.orig/ie_imp_MsWord_97.cpp      Sat Feb  5 19:49:37 2000
+++ abi/src/wp/impexp/xp/ie_imp_MsWord_97.cpp   Mon Feb  7 11:02:03 2000
@@ -681,6 +681,82 @@
 /*****************************************************************/
 /*****************************************************************/
 
+UT_Bool IE_Imp_MsWord_97::RecognizeContents(const char * szBuf, int iNumbytes)
+{
+       // TODO: This is rather crude, because we don't parse OLE files.
+       // TODO: For the time being, we assume that any OLE file is an
+       // TODO: msword document.
+       // TODO: Caolan is gonna kill me for this.  :)
+       // Most of the magic numbers here were taken from the public domain
+       // /etc/magic file distributed with the file(1) command written
+       // by Ian F. Darwin, with contributions and magic entries from
+       // Rob McMahon, Guy Harris, Christos Zoulas <[EMAIL PROTECTED]>,
+       // Mark Moraes <[EMAIL PROTECTED]>, and Pawel Wiecek.
+       char *magic ;
+       int magicoffset ;
+       magic = "Microsoft Word 6.0 Document" ;
+       magicoffset = 2080 ;
+       if ( iNumbytes > magicoffset+strlen(magic) )
+       {
+               if ( strncmp(szBuf+magicoffset, magic, strlen(magic)) == 0 )
+               {
+                       return(UT_TRUE);
+               }
+       }
+       magic = "Documento Microsoft Word 6" ;
+       magicoffset = 2080 ;
+       if ( iNumbytes > magicoffset+strlen(magic) )
+       {
+               if ( strncmp(szBuf+magicoffset, magic, strlen(magic)) == 0 )
+               {
+                       return(UT_TRUE);
+               }
+       }
+       magic = "MSWordDoc" ;
+       magicoffset = 2112 ;
+       if ( iNumbytes > magicoffset+strlen(magic) )
+       {
+               if ( strncmp(szBuf+magicoffset, magic, strlen(magic)) == 0 )
+               {
+                       return(UT_TRUE);
+               }
+       }
+       if ( iNumbytes > 8 )
+       {
+               if ( szBuf[0] == (char)0x31 && szBuf[1] == (char)0xbe &&
+                        szBuf[2] == (char)0 && szBuf[3] == (char)0 )
+               {
+                       return(UT_TRUE);
+               }
+               if ( szBuf[0] == 'P' && szBuf[1] == 'O' &&
+                        szBuf[2] == '^' && szBuf[3] == 'Q' && szBuf[4] == '`' )
+               {
+                       return(UT_TRUE);
+               }
+               if ( szBuf[0] == (char)0xfe && szBuf[1] == (char)0x37 &&
+                        szBuf[2] == (char)0 && szBuf[3] == (char)0x23 )
+               {
+                       return(UT_TRUE);
+               }
+               // OLE magic:
+               // TODO: Dig through the OLE file
+               if ( szBuf[0] == (char)0xd0 && szBuf[1] == (char)0xcf &&
+                        szBuf[2] == (char)0x11 && szBuf[3] == (char)0xe0 &&
+                        szBuf[4] == (char)0xa1 && szBuf[5] == (char)0xb1 &&
+                        szBuf[6] == (char)0x1a && szBuf[7] == (char)0xe1 )
+               {
+                       return(UT_TRUE);
+               }
+               if ( szBuf[0] == (char)0xdb && szBuf[1] == (char)0xa5 &&
+                        szBuf[2] == (char)0x2d && szBuf[3] == (char)0 &&
+                        szBuf[4] == (char)0 && szBuf[5] == (char)0 )
+               {
+                       return(UT_TRUE);
+               }
+       }
+       return(UT_FALSE);
+}
+
 UT_Bool IE_Imp_MsWord_97::RecognizeSuffix(const char * szSuffix)
 {
        return (UT_stricmp(szSuffix,".doc") == 0);
diff -u -d -r -P abi/src/wp/impexp/xp.orig/ie_imp_MsWord_97.h 
abi/src/wp/impexp/xp/ie_imp_MsWord_97.h
--- abi/src/wp/impexp/xp.orig/ie_imp_MsWord_97.h        Sat Feb  5 19:49:37 2000
+++ abi/src/wp/impexp/xp/ie_imp_MsWord_97.h     Mon Feb  7 10:33:52 2000
@@ -43,6 +43,7 @@
        virtual void            pasteFromBuffer(PD_DocumentRange * pDocRange,
                                                                                
unsigned char * pData, UT_uint32 lenData);
 
+       static UT_Bool          RecognizeContents(const char * szBuf, int iNumbytes);
        static UT_Bool          RecognizeSuffix(const char * szSuffix);
        static UT_Error         StaticConstructor(PD_Document * pDocument,
                                                                                  
IE_Imp ** ppie);
diff -u -d -r -P abi/src/wp/impexp/xp.orig/ie_imp_RTF.cpp 
abi/src/wp/impexp/xp/ie_imp_RTF.cpp
--- abi/src/wp/impexp/xp.orig/ie_imp_RTF.cpp    Fri Feb  4 14:47:07 2000
+++ abi/src/wp/impexp/xp/ie_imp_RTF.cpp Mon Feb  7 10:33:52 2000
@@ -319,6 +319,19 @@
 /*****************************************************************/
 /*****************************************************************/
 
+UT_Bool IE_Imp_RTF::RecognizeContents(const char * szBuf, int iNumbytes)
+{
+       if ( iNumbytes < 5 )
+       {
+               return(UT_FALSE);
+       }
+       if ( strncmp( szBuf, "{\\rtf", 5 ) == 0 )
+       {
+               return(UT_TRUE) ;
+       }
+       return(UT_FALSE);
+}
+
 UT_Bool IE_Imp_RTF::RecognizeSuffix(const char * szSuffix)
 {
        return (UT_stricmp(szSuffix,".rtf") == 0);
diff -u -d -r -P abi/src/wp/impexp/xp.orig/ie_imp_RTF.h 
abi/src/wp/impexp/xp/ie_imp_RTF.h
--- abi/src/wp/impexp/xp.orig/ie_imp_RTF.h      Tue Feb  1 11:41:09 2000
+++ abi/src/wp/impexp/xp/ie_imp_RTF.h   Mon Feb  7 10:33:52 2000
@@ -175,6 +175,7 @@
        virtual void            pasteFromBuffer(PD_DocumentRange * pDocRange,
                                                                                
unsigned char * pData, UT_uint32 lenData);
 
+       static UT_Bool          RecognizeContents(const char * szBuf, int iNumbytes);
        static UT_Bool          RecognizeSuffix(const char * szSuffix);
        static UT_Error         StaticConstructor(PD_Document * pDocument,
                                                                                  
IE_Imp ** ppie);
diff -u -d -r -P abi/src/wp/impexp/xp.orig/ie_imp_Text.cpp 
abi/src/wp/impexp/xp/ie_imp_Text.cpp
--- abi/src/wp/impexp/xp.orig/ie_imp_Text.cpp   Wed Jan 26 16:22:45 2000
+++ abi/src/wp/impexp/xp/ie_imp_Text.cpp        Mon Feb  7 11:03:49 2000
@@ -237,6 +237,14 @@
 /*****************************************************************/
 /*****************************************************************/
 
+UT_Bool IE_Imp_Text::RecognizeContents(const char * szBuf, int iNumbytes)
+{
+       // TODO: We give the other guys a chance, since this
+       // TODO: importer is so generic.  Does this seem
+       // TODO: like a sensible strategy?
+       return(UT_FALSE);
+}
+
 UT_Bool IE_Imp_Text::RecognizeSuffix(const char * szSuffix)
 {
        return (UT_stricmp(szSuffix,".txt") == 0);
diff -u -d -r -P abi/src/wp/impexp/xp.orig/ie_imp_Text.h 
abi/src/wp/impexp/xp/ie_imp_Text.h
--- abi/src/wp/impexp/xp.orig/ie_imp_Text.h     Wed Jan 26 16:22:45 2000
+++ abi/src/wp/impexp/xp/ie_imp_Text.h  Mon Feb  7 10:33:52 2000
@@ -37,6 +37,7 @@
        virtual void            pasteFromBuffer(PD_DocumentRange * pDocRange,
                                                                                
unsigned char * pData, UT_uint32 lenData);
 
+       static UT_Bool          RecognizeContents(const char * szBuf, int iNumbytes);
        static UT_Bool          RecognizeSuffix(const char * szSuffix);
        static UT_Error         StaticConstructor(PD_Document * pDocument,
                                                                                  
IE_Imp ** ppie);
diff -u -d -r -P abi/src/wp/impexp/xp.orig/ie_imp_UTF8.cpp 
abi/src/wp/impexp/xp/ie_imp_UTF8.cpp
--- abi/src/wp/impexp/xp.orig/ie_imp_UTF8.cpp   Wed Jan 26 16:22:45 2000
+++ abi/src/wp/impexp/xp/ie_imp_UTF8.cpp        Mon Feb  7 11:04:05 2000
@@ -302,6 +302,12 @@
 /*****************************************************************/
 /*****************************************************************/
 
+UT_Bool IE_Imp_UTF8::RecognizeContents(const char * szBuf, int iNumbytes)
+{
+       // TODO: Not yet written
+       return(UT_FALSE);
+}
+
 UT_Bool IE_Imp_UTF8::RecognizeSuffix(const char * szSuffix)
 {
        return (UT_stricmp(szSuffix,".utf8") == 0);
diff -u -d -r -P abi/src/wp/impexp/xp.orig/ie_imp_UTF8.h 
abi/src/wp/impexp/xp/ie_imp_UTF8.h
--- abi/src/wp/impexp/xp.orig/ie_imp_UTF8.h     Wed Jan 26 16:22:45 2000
+++ abi/src/wp/impexp/xp/ie_imp_UTF8.h  Mon Feb  7 10:33:52 2000
@@ -37,6 +37,7 @@
        virtual void            pasteFromBuffer(PD_DocumentRange * pDocRange,
                                                                                
unsigned char * pData, UT_uint32 lenData);
 
+       static UT_Bool          RecognizeContents(const char * szBuf, int iNumbytes);
        static UT_Bool          RecognizeSuffix(const char * szSuffix);
        static UT_Error         StaticConstructor(PD_Document * pDocument,
                                                                                  
IE_Imp ** ppie);

File sniffing patch

Reply via email to