Hi!

Starting from scratch would be possibly the best anyway. I had it also on my 
todo list on a very low priority ... but just, because I found that jMimeMagic 
has a really worse implemenattion - extremly slow and not working correctly. I 
have a good pile of image files it does not detect. Main reason is, that the 
implementation is simply wrong. The original magic files have a clear idea of 
precedence of patterns - this has been lost completely in the 
conversion/implementation of jMimeMagic.

- Jörg

Using the original magic file and parse it in Java also makes it easier to keep it updated. Just add the newest magic file to the jar file and we are done.

I have a (pretty C-like) Python port of file(1). Maybe that can be of some help, implementing it in Java. I don't know about the license of that code though.

The C implementation can be found here:

http://www.opensource.apple.com/darwinsource/Current/file-6/file/src/

The meat is in softmagic.c.  It under a BSD license.

   /Markus

# Found on a russian zope mailing list, and modified to fix bugs in parsing

# the magic file and string making

# -- Daniel Berlin <[EMAIL PROTECTED]>

import sys, struct, time, re, exceptions, pprint, stat, os, pwd, grp

_mew = 0

# _magic='/tmp/magic'

# _magic='/usr/share/magic.mime'

#_magic='/usr/share/magic.mime'

_magic = "magic.mime"
mime = 1

_ldate_adjust = lambda x: time.mktime( time.gmtime(x) )

BUFFER_SIZE = 1024 * 128 # 128K should be enough...


class MagicError(exceptions.Exception): pass

def _handle(fmt='@x',adj=None): return fmt, struct.calcsize(fmt), adj

KnownTypes = {
        # 'byte':_handle('@b'),

        'byte':_handle('@B'),
        'ubyte':_handle('@B'),

        'string':('s',0,None),
        'pstring':_handle('p'),

#       'short':_handle('@h'),

#       'beshort':_handle('>h'),

#       'leshort':_handle('<h'),

        'short':_handle('@H'),
        'beshort':_handle('>H'),
        'leshort':_handle('<H'),
        'ushort':_handle('@H'),
        'ubeshort':_handle('>H'),
        'uleshort':_handle('<H'),

        'long':_handle('@l'),
        'belong':_handle('>l'),
        'lelong':_handle('<l'),
        'ulong':_handle('@L'),
        'ubelong':_handle('>L'),
        'ulelong':_handle('<L'),

        'date':_handle('=l'),
        'bedate':_handle('>l'),
        'ledate':_handle('<l'),
        'ldate':_handle('=l',_ldate_adjust),
        'beldate':_handle('>l',_ldate_adjust),
        'leldate':_handle('<l',_ldate_adjust),
}

_mew_cnt = 0
def mew(x):
    global _mew_cnt
    if _mew :
        if x=='.' :
            _mew_cnt += 1
            if _mew_cnt % 64 == 0 : sys.stderr.write( '\n' )
            sys.stderr.write( '.' )
        else:
            sys.stderr.write( '\b'+x )

def has_format(s):
    n = 0
    l = None
    for c in s :
        if c == '%' :
            if l == '%' : n -= 1
            else        : n += 1
        l = c
    return n

def read_asciiz(file,size=None,pos=None):
    s = []
    if pos :
        mew('s')
        file.seek( pos, 0 )
    mew('z')
    if size is not None :
        s = [file.read( size ).split('\0')[0]]
    else:
        while 1 :
            c = file.read(1)
            if (not c) or (ord(c)==0) or (c=='\n') : break
            s.append (c)
    mew('Z')
    return ''.join(s)

def a2i(v,base=0):
    if v[-1:] in 'lL' : v = v[:-1]
    return int( v, base )

_cmap = {
        '\\' : '\\',
        '0' : '\0',
}
for c in range(ord('a'),ord('z')+1) :
    try               : e = eval('"\\%c"' % chr(c))
    except ValueError : pass
    else              : _cmap[chr(c)] = e
else:
    del c
    del e

def make_string(s):
    return eval( '"'+s.replace('"','\\"')+'"')

class MagicTestError(MagicError): pass

class MagicTest:
    def __init__(self,offset,mtype,test,message,line=None,level=None):
        self.line, self.level = line, level
        self.mtype = mtype
        self.mtest = test
        self.subtests = []
        self.mask = None
        self.smod = None
        self.nmod = None
        self.offset, self.type, self.test, self.message = \
                        offset,mtype,test,message
        if self.mtype == 'true' : return # XXX hack to enable level skips

        if test[-1:]=='\\' and test[-2:]!='\\\\' :
            self.test += 'n' # looks like someone wanted EOL to match?

        if mtype[:6]=='string' :
            if '/' in mtype : # for strings

                self.type, self.smod = \
                                        mtype[:mtype.find('/')], mtype[mtype.find('/')+1:]
        else:
            for nm in '&+-' :
                if nm in mtype : # for integer-based

                    self.nmod, self.type, self.mask = (
                            nm,
                            mtype[:mtype.find(nm)],
                            # convert mask to int, autodetect base

                            int( mtype[mtype.find(nm)+1:], 0 )
                    )
                    break
        self.struct, self.size, self.cast = KnownTypes[ self.type ]
    def __str__(self):
        return '%s %s %s %s' % (
                self.offset, self.mtype, self.mtest, self.message
        )
    def __repr__(self):
        return 'MagicTest(%s,%s,%s,%s,line=%s,level=%s,subtests=\n%s%s)' % (
                `self.offset`, `self.mtype`, `self.mtest`, `self.message`,
                `self.line`, `self.level`,
                '\t'*self.level, pprint.pformat(self.subtests)
        )
    def run(self,file):
        result = ''
        do_close = 0
        try:
            if type(file) == type('x') :
                file = open( file, 'r', BUFFER_SIZE )
                do_close = 1
#                       else:

#                               saved_pos = file.tell()

            if self.mtype != 'true' :
                data = self.read(file)
                last = file.tell()
            else:
                data = last = None
            if self.check( data ) :
                result = self.message+' '
                if has_format( result ) : result %= data
                for test in self.subtests :
                    m = test.run(file)
                    if m is not None : result += m
                return make_string( result )
        finally:
            if do_close :
                file.close()
#                       else:

#                               file.seek( saved_pos, 0 )

    def get_mod_and_value(self):
        if self.type[-6:] == 'string' :
            # "something like\tthis\n"

            if self.test[0] in '=<>' :
                mod, value = self.test[0], make_string( self.test[1:] )
            else:
                mod, value = '=', make_string( self.test )
        else:
            if self.test[0] in '=<>&^' :
                mod, value = self.test[0], a2i(self.test[1:])
            elif self.test[0] == 'x':
                mod = self.test[0]
                value = 0
            else:
                mod, value = '=', a2i(self.test)
        return mod, value
    def read(self,file):
        mew( 's' )
        file.seek( self.offset(file), 0 ) # SEEK_SET

        mew( 'r' )
        try:
            data = rdata = None
            # XXX self.size might be 0 here...

            if self.size == 0 :
                # this is an ASCIIZ string...

                size = None
                if self.test != '>\\0' : # magic's hack for string read...

                    value = self.get_mod_and_value()[1]
                    size = (value=='\0') and None or len(value)
                rdata = data = read_asciiz( file, size=size )
            else:
                rdata = file.read( self.size )
                if not rdata or (len(rdata)!=self.size) : return None
                data = struct.unpack( self.struct, rdata )[0] # XXX hack??

        except:
            print >>sys.stderr, self
            print >>sys.stderr, '@%s struct=%s size=%d rdata=%s' % (
                    self.offset, `self.struct`, self.size,`rdata`)
            raise
        mew( 'R' )
        if self.cast : data = self.cast( data )
        if self.mask :
            try:
                if   self.nmod == '&' : data &= self.mask
                elif self.nmod == '+' : data += self.mask
                elif self.nmod == '-' : data -= self.mask
                else: raise MagicTestError(self.nmod)
            except:
                print >>sys.stderr,'data=%s nmod=%s mask=%s' % (
                        `data`, `self.nmod`, `self.mask`
                )
                raise
        return data
    def check(self,data):
        mew('.')
        if self.mtype == 'true' :
            return '' # not None !

        mod, value = self.get_mod_and_value()
        if self.type[-6:] == 'string' :
            # "something like\tthis\n"

            if self.smod :
                xdata = data
                if 'b' in self.smod : # all blanks are optional

                    xdata = ''.join( data.split() )
                    value = ''.join( value.split() )
                if 'c' in self.smod : # all blanks are optional

                    xdata = xdata.upper()
                    value = value.upper()
            # if 'B' in self.smod : # compact blanks

            ### XXX sorry, i don't understand this :-(

            #       data = ' '.join( data.split() )

            #       if ' ' not in data : return None

            else:
                xdata = data
        try:
            if   mod == '=' : result = data == value
            elif mod == '<' : result = data < value
            elif mod == '>' : result = data > value
            elif mod == '&' : result = data & value
            elif mod == '^' : result = (data & (~value)) == 0
            elif mod == 'x' : result = 1
            else            : raise MagicTestError(self.test)
            if result :
                zdata, zval = `data`, `value`
                if self.mtype[-6:]!='string' :
                    try: zdata, zval = hex(data), hex(value)
                    except: zdata, zval = `data`, `value`
                if 0 : print >>sys.stderr, '%s @%s %s:%s %s %s => %s (%s)' % (
                        '>'*self.level, self.offset,
                        zdata, self.mtype, `mod`, zval, `result`,
                        self.message
                )
            return result
        except:
            print >>sys.stderr,'mtype=%s data=%s mod=%s value=%s' % (
                    `self.mtype`, `data`, `mod`, `value`
            )
            raise
    def add(self,mt):
        if not isinstance(mt,MagicTest) :
            raise MagicTestError((mt,'incorrect subtest type %s'%(type(mt),)))
        if mt.level == self.level+1 :
            self.subtests.append( mt )
        elif self.subtests :
            self.subtests[-1].add( mt )
        elif mt.level > self.level+1 :
            # it's possible to get level 3 just after level 1 !!! :-(

            level = self.level + 1
            while level < mt.level :
                xmt = MagicTest(None,'true','x','',line=self.line,level=level)
                self.add( xmt )
                level += 1
            else:
                self.add( mt ) # retry...

        else:
            raise MagicTestError((mt,'incorrect subtest level %s'%(`mt.level`,)))
    def last_test(self):
        return self.subtests[-1]
#end class MagicTest


class OffsetError(MagicError): pass

class Offset:
    pos_format = {'b':'<B','B':'>B','s':'<H','S':'>H','l':'<I','L':'>I',}
    pattern0 = re.compile(r'''    # mere offset

                ^
                &?                                          # possible ampersand

                (       0                                       # just zero

                |       [1-9]{1,1}[0-9]*        # decimal

                |       0[0-7]+                         # octal

                |       0x[0-9a-f]+                     # hex

                )
                $
                ''', re.X|re.I
    )
    pattern1 = re.compile(r'''    # indirect offset

                ^\(
                (?P<base>&?0                  # just zero

                        |&?[1-9]{1,1}[0-9]* # decimal

                        |&?0[0-7]*          # octal

                        |&?0x[0-9A-F]+      # hex

                )
                (?P<type>
                        \.         # this dot might be alone

                        [BSL]? # one of this chars in either case

                )?
                (?P<sign>
                        [-+]{0,1}
                )?
                (?P<off>0              # just zero

                        |[1-9]{1,1}[0-9]*  # decimal

                        |0[0-7]*           # octal

                        |0x[0-9a-f]+       # hex

                )?
                \)$''', re.X|re.I
    )
    def __init__(self,s):
        self.source = s
        self.value  = None
        self.relative = 0
        self.base = self.type = self.sign = self.offs = None
        m = Offset.pattern0.match( s )
        if m : # just a number

            if s[0] == '&' :
                self.relative, self.value = 1, int( s[1:], 0 )
            else:
                self.value = int( s, 0 )
            return
        m = Offset.pattern1.match( s )
        if m : # real indirect offset

            try:
                self.base = m.group('base')
                if self.base[0] == '&' :
                    self.relative, self.base = 1, int( self.base[1:], 0 )
                else:
                    self.base = int( self.base, 0 )
                if m.group('type') : self.type = m.group('type')[1:]
                self.sign = m.group('sign')
                if m.group('off') : self.offs = int( m.group('off'), 0 )
                if self.sign == '-' : self.offs = 0 - self.offs
            except:
                print >>sys.stderr, '$$', m.groupdict()
                raise
            return
        raise OffsetError(`s`)
    def __call__(self,file=None):
        if self.value is not None : return self.value
        pos = file.tell()
        try:
            if not self.relative : file.seek( self.offset, 0 )
            frmt = Offset.pos_format.get( self.type, 'I' )
            size = struct.calcsize( frmt )
            data = struct.unpack( frmt, file.read( size ) )
            if self.offs : data += self.offs
            return data
        finally:
            file.seek( pos, 0 )
    def __str__(self): return self.source
    def __repr__(self): return 'Offset(%s)' % `self.source`
#end class Offset


class MagicFileError(MagicError): pass

class MagicFile:
    def __init__(self,filename=_magic):
        self.file = None
        self.tests = []
        self.total_tests = 0
        self.load( filename )
        self.ack_tests = None
        self.nak_tests = None
    def __del__(self):
        self.close()
    def load(self,filename=None):
        self.open( filename )
        self.parse()
        self.close()
    def open(self,filename=None):
        self.close()
        if filename is not None :
            self.filename = filename
        self.file = open( self.filename, 'r', BUFFER_SIZE )
    def close(self):
        if self.file :
            self.file.close()
            self.file = None
    def parse(self):
        line_no = 0
        for line in self.file.xreadlines() :
            line_no += 1
            if not line or line[0]=='#' : continue

            line = line.lstrip().rstrip('\r\n')
            if not line or line[0]=='#' : continue

            try:
                x = self.parse_line( line )
                if x is None :
                    print >>sys.stderr, '#[%04d]#'%line_no, line

                    continue
            except:
                print >>sys.stderr, '###[%04d]###'%line_no, line

                raise
            self.total_tests += 1
            level, offset, mtype, test, message = x
            new_test = MagicTest(offset,mtype,test,message,
                    line=line_no,level=level)
            try:
                if level == 0 :
                    self.tests.append( new_test )
                else:
                    self.tests[-1].add( new_test )
            except:
                if 1 :
                    print >>sys.stderr, 'total tests=%s' % (
                            `self.total_tests`,
                    )
                    print >>sys.stderr, 'level=%s' % (
                            `level`,
                    )
                    print >>sys.stderr, 'tests=%s' % (
                            pprint.pformat(self.tests),
                    )
                raise
        else:
            while self.tests[-1].level > 0 :
                self.tests.pop()
    def parse_line(self,line):
        # print >>sys.stderr, 'line=[%s]' % line

        if (not line) or line[0]=='#' : return None

        level = 0
        offset = mtype = test = message = ''
        mask = None
        # get optional level (count leading '>')

        while line and line[0]=='>' :
            line, level = line[1:], level+1
        # get offset

        while line and not line[0].isspace() :
            offset, line = offset+line[0], line[1:]
        try:
            offset = Offset(offset)
        except:
            print >>sys.stderr, 'line=[%s]' % line
            raise
        # skip spaces

        line = line.lstrip()
        # get type

        c = None
        while line :
            last_c, c, line = c, line[0], line[1:]
            if last_c!='\\' and c.isspace() :
                break # unescaped space - end of field

            else:
                mtype += c
                if last_c == '\\' :
                    c = None # don't fuck my brain with sequential backslashes

        # skip spaces

        line = line.lstrip()
        # get test

        c = None
        while line :
            last_c, c, line = c, line[0], line[1:]
            if last_c!='\\' and c.isspace() :
                break # unescaped space - end of field

            else:
                test += c
                if last_c == '\\' :
                    c = None # don't fuck my brain with sequential backslashes

        # skip spaces

        line = line.lstrip()
        # get message

        message = line
        if mime and line.find("\t") != -1:
            message=line[0:line.find("\t")]
        #

        # print '>>', level, offset, mtype, test, message

        return level, offset, mtype, test, message
    def detect(self,file):
        self.ack_tests = 0
        self.nak_tests = 0
        answers = []
        for test in self.tests :
            message = test.run( file )
            if message :
                self.ack_tests += 1
                answers.append( message )
            else:
                self.nak_tests += 1
        if answers :
            return '; '.join( answers )
#end class MagicFile


def username(uid):
    try:
        return pwd.getpwuid( uid )[0]
    except:
        return '#%s'%uid


def groupname(gid):
    try:
        return grp.getgrgid( gid )[0]
    except:
        return '#%s'%gid


def get_file_type(fname,follow):
    t = None
    if not follow :
        try:
            st = os.lstat( fname ) # stat that entry, don't follow links!

        except os.error, why :
            pass
        else:
            if stat.S_ISLNK(st[stat.ST_MODE]) :
                t = 'symbolic link'
                try:
                    lnk = os.readlink( fname )
                except:
                    t += ' (unreadable)'
                else:
                    t += ' to '+lnk
    if t is None :
        try:
            st = os.stat( fname )
        except os.error, why :
            return "can't stat `%s' (%s)." % (why.filename,why.strerror)

    dmaj, dmin = (st.st_rdev>>8)&0x0FF, st.st_rdev&0x0FF

    if 0 : pass
    elif stat.S_ISSOCK(st.st_mode) : t = 'socket'
    elif stat.S_ISLNK (st.st_mode) : t = follow and 'symbolic link' or t
    elif stat.S_ISREG (st.st_mode) : t = 'file'
    elif stat.S_ISBLK (st.st_mode) : t = 'block special (%d/%d)'%(dmaj,dmin)
    elif stat.S_ISDIR (st.st_mode) : t = 'directory'
    elif stat.S_ISCHR (st.st_mode) : t = 'character special (%d/%d)'%(dmaj,dmin)
    elif stat.S_ISFIFO(st.st_mode) : t = 'pipe'
    else: t = '<unknown>'

    if st.st_mode & stat.S_ISUID :
        t = 'setuid(%d=%s) %s'%(st.st_uid,username(st.st_uid),t)
    if st.st_mode & stat.S_ISGID :
        t = 'setgid(%d=%s) %s'%(st.st_gid,groupname(st.st_gid),t)
    if st.st_mode & stat.S_ISVTX :
        t = 'sticky '+t

    return t

HELP = '''%s [options] [files...]

Options:

        -?, --help -- this help
        -m, --magic=<file> -- use this magic <file> instead of %s
        -f, --files=<namefile> -- read filenames for <namefile>
*       -C, --compile -- write "compiled" magic file
        -b, --brief -- don't prepend filenames to output lines
+       -c, --check -- check the magic file
        -i, --mime -- output MIME types
*       -k, --keep-going -- don't stop st the first match
        -n, --flush -- flush stdout after each line
        -v, --verson -- print version and exit
*       -z, --compressed -- try to look inside compressed files
        -L, --follow -- follow symlinks
        -s, --special -- don't skip special files

*       -- not implemented so far ;-)
+       -- implemented, but in another way...
'''

def main():
    import getopt
    global _magic
    try:
        brief = 0
        flush = 0
        follow= 0
        mime  = 0
        check = 0
        special=0
        try:
            opts, args = getopt.getopt(
                    sys.argv[1:],
                    '?m:f:CbciknvzLs',
                    (       'help',
                            'magic=',
                            'names=',
                            'compile',
                            'brief',
                            'check',
                            'mime',
                            'keep-going',
                            'flush',
                            'version',
                            'compressed',
                            'follow',
                            'special',
                    )
            )
        except getopt.error, why:
            print >>sys.stderr, sys.argv[0], why
            return 1
        else:
            files = None
            for o,v in opts :
                if o in ('-?','--help'):
                    print HELP % (
                            sys.argv[0],
                            _magic,
                    )
                    return 0
                elif o in ('-f','--files='):
                    files = v
                elif o in ('-m','--magic='):
                    _magic = v[:]
                elif o in ('-C','--compile'):
                    pass
                elif o in ('-b','--brief'):
                    brief = 1
                elif o in ('-c','--check'):
                    check = 1
                elif o in ('-i','--mime'):
                    mime = 1
                    if os.path.exists( _magic+'.mime' ) :
                        _magic += '.mime'
                        print >>sys.stderr,sys.argv[0]+':',\
                                                        "Using regular magic file `%s'" % _magic
                elif o in ('-k','--keep-going'):
                    pass
                elif o in ('-n','--flush'):
                    flush = 1
                elif o in ('-v','--version'):
                    print 'VERSION'
                    return 0
                elif o in ('-z','--compressed'):
                    pass
                elif o in ('-L','--follow'):
                    follow = 1
                elif o in ('-s','--special'):
                    special = 1
            else:
                if files :
                    files = map(lambda x: x.strip(), v.split(','))
                    if '-' in files and '-' in args :
                        error( 1, 'cannot use STDIN simultaneously for file list and data' )
                    for file in files :
                        for name in (
                                        (file=='-')
                                                and sys.stdin
                                                or open(file,'r',BUFFER_SIZE)
                        ).xreadlines():
                            name = name.strip()
                            if name not in args :
                                args.append( name )
        try:
            if check : print >>sys.stderr, 'Loading magic database...'
            t0 = time.time()
            m = MagicFile(_magic)
            t1 = time.time()
            if check :
                print >>sys.stderr, \
                                        m.total_tests, 'tests loaded', \
                                        'for', '%.2f' % (t1-t0), 'seconds'
                print >>sys.stderr, len(m.tests), 'tests at top level'
                return 0 # XXX "shortened" form ;-)


            mlen = max( map(len, args) )+1
            for arg in args :
                if not brief : print (arg + ':').ljust(mlen),
                ftype = get_file_type( arg, follow )
                if (special and ftype.find('special')>=0) \
                                or ftype[-4:] == 'file' :
                    t0 = time.time()
                    try:
                        t = m.detect( arg )
                    except (IOError,os.error), why:
                        t = "can't read `%s' (%s)" % (why.filename,why.strerror)
                    if ftype[-4:] == 'file' : t = ftype[:-4] + t
                    t1 = time.time()
                    print t and t or 'data'
                    if 0 : print \
                                                        '#\t%d tests ok, %d tests failed for %.2f seconds'%\

                                                        (m.ack_tests, m.nak_tests, t1-t0)
                else:
                    print mime and 'application/x-not-regular-file' or ftype
                if flush : sys.stdout.flush()
        # print >>sys.stderr, 'DONE'

        except:
            if check : return 1
            raise
        else:
            return 0
    finally:
        pass

if __name__ == '__main__' :
    sys.exit( main() )
# vim:ai

# EOF #


---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Reply via email to