Here is the latest version of the Apache CLF web.log data mining script
which parses the .log files and dumps the data into .csv files for use in
Excel, etc.

The latest version filters out local IP addresses, localhost, and also hits
on files with .gif, .jpg, .swf, and .css extensions. Of course, the script can
be customized easily.

REBOL [
   Title:   "Apache CLF web.log data mining script"
   Date:    26-March-2001
   Name:    'Log-Parser
   Version: 0.6.5
   File:    %log-parser.r
   Home:    http://www.dangeroustechnology.com
   Author:  "Ryan C. Christiansen"
   Email:   [EMAIL PROTECTED]
   Owner:   "Ryan Christiansen"
   Rights:  "Copyright (C) Ryan Christiansen 2001"
   Language: 'English
   Charset: 'ANSI
   Purpose: {
      Parse Apache CLF web.log files and dump data into .csv files.
   }

   Comment: {
      Script relies on the existence of %dns-library.r which it uses as a
dns-cache file. The first time you run the script, begin with a 0-byte
%dns-library.r file.
   }

   ; The change note spans several lines, so it is a braced string:
   ; a double-quoted string must end on the line where it starts.
   History: [
      0.6.5 [26-March-2001 {Added full header to script. Added choice for
parsing local or remote log. Added filter-local-IPs function. Added
filter-file-extensions function, including filter for cobalt server error
files and robots.txt files} "Ryan"]
   ]

   Example: {do %log-parser.r}
]

; Ask where the log lives, then load it into LOG-FILE as a block of
; strings (READ/LINES yields one string per line of the log).
print "Do you wish to parse a (L)ocal or (R)emote file?"

choice: input

switch/default choice [
    "L" [
        ; web.log in the current directory
        log-file: read/lines %web.log
    ]
    "R" [
        ; PLACEHOLDER URL: substitute the real FTP user name, password and
        ; host before choosing the remote option.
        log-file: read/lines ftp://username:password@example.com/logs/web.log
    ]
][
    ; Anything other than L or R: report it, pause so the message can be
    ; read, then leave the interpreter.
    print "Invalid choice -- session ended"
    wait 4
    quit
]

; Load the DNS cache saved by a previous run: a flat, space-separated list
; of alternating IP address / domain name entries. A missing cache file is
; treated the same as an empty one, so a first run needs no preparation.
retrieved-library: either exists? %dns-library.r [
    read %dns-library.r
][
    copy ""
]
dns-library: parse retrieved-library none

assemble-date: func [
    "Parse one line in CLF web.log format and return the date as a REBOL date! datatype"
    log-line [string!] "One line in CLF web.log format"
    /local stamp
][
    ; The fourth field of the split line is the opening half of the
    ; bracketed CLF timestamp, e.g. "[10/Oct/2000:13:55:36".
    stamp: pick parse log-line none 4

    ; Step over the opening bracket when it is there.
    if #"[" = first stamp [stamp: next stamp]

    ; Only the day/month/year part (everything before the first colon) is
    ; converted, so the result is a plain date with no time or zone.
    ; A line without a timestamp field raises an error for the caller.
    to-date copy/part stamp find stamp ":"
]

assemble-time: func [
    "Parse one line in CLF web.log format and return the time as a REBOL time! datatype"
    log-line [string!] "One line in CLF web.log format"
    /local stamp
][
    ; The fourth field of the split line is the opening half of the
    ; bracketed CLF timestamp, e.g. "[10/Oct/2000:13:55:36".
    stamp: pick parse log-line none 4

    ; The time of day is everything after the first colon in that field.
    ; The zone offset lives in the following field and is not used.
    ; A line without a timestamp field raises an error for the caller.
    to-time copy next find stamp ":"
]

dns-lookup: func [
    "Convert an IP address to a domain name"
    dns-cache "A cache of IP addresses and corresponding domain names"
    IP [string!] "The IP address that needs to be converted to a domain"
    /local domain
][
    ; Try the cache first; only go to the resolver on a miss.
    if none? domain: select dns-cache IP [
        ; A lookup that yields nothing is recorded as "unresolved".
        domain: any [
            read join dns:// IP
            "unresolved"
        ]
        ; Remember the answer as an IP / domain pair at the tail of the cache.
        repend dns-cache [IP domain]
    ]
    domain
]

filter-local-IPs: func [
    "Return true when a hit should be kept, false when it comes from a local address"
    IP-to-check [string!] "The IP address as a string parsed from the CLF log file entry"
    /local IP
][
    IP: make tuple! IP-to-check
    not any [
        ; local network: 216.115.108.1 through 216.115.108.254
        all [(IP >= 216.115.108.1) (IP <= 216.115.108.254)]
        ; localhost
        IP = 127.0.0.1
    ]
]

filter-file-extensions: func [
    "Return true when a hit should be kept, false when the request matches an ignored pattern"
    file-to-filter [string!] "A string from the CLF web.log containing the requested file during the hit"
][
    ; A hit is dropped as soon as its request string contains any one of
    ; these fragments.
    foreach pattern [
        ; image, flash and stylesheet files
        ".gif" ".jpg" ".jpeg" ".bmp" ".swf" ".css"
        ; cobalt error files
        "question_warning" "lock_warning"
        ; requests for robots.txt files
        "robots.txt"
    ][
        if find/any file-to-filter pattern [return false]
    ]
    true
]

parse-log-line: func [
    "Parse one line in CLF web.log format and append the IP address, domain, hit date, hit time, file hit, bytes used, referring page, and browser type to a .csv file as one row"
    log-line [string!] "One line in CLF web.log format"
    file-to-save [file!] "The name of the target file to write returned variables"
    /local current-line IP-address hit-file domain-address
][
    current-line: parse log-line none
    IP-address: make string! current-line/1
    hit-file: current-line/6

    ; Apply both filters before the DNS lookup, so hits that are going to
    ; be discarded anyway never trigger a lookup or enter the DNS cache.
    if all [
        filter-local-IPs IP-address
        filter-file-extensions hit-file
    ][
        domain-address: dns-lookup dns-library IP-address

        ; NOTE(review): values are written unquoted, so a comma inside the
        ; referring page or browser type will shift the columns of that row.
        write/append file-to-save rejoin [
            IP-address ","
            domain-address ","
            assemble-date log-line ","
            assemble-time log-line ","
            hit-file ","
            current-line/8 ","      ; bytes used
            current-line/9 ","      ; referring page
            current-line/10 newline ; browser type
        ]
    ]
]



; Column titles written as the first line of every newly started .csv file.
csv-header: "User IP Address, User Domain Address, Date Hit, Time Hit, File Hit, Bytes Transferred, Referring Page, Browser Type"

checksum-date: none   ; date of the .csv file currently being written
csv-file-name: none   ; that file's name, <date>.csv
skipped-lines: 0      ; log lines that raised an error while being processed

; Walk the log once, writing each hit to the .csv file named after its date.
foreach log-line log-file [
    either error? try [hit-date: assemble-date log-line] [
        ; No usable date on this line: skip it and keep going.
        skipped-lines: skipped-lines + 1
    ][
        if not-equal? hit-date checksum-date [
            ; The date changed (or this is the first dated line):
            ; switch to that day's file.
            csv-file-name: make file! rejoin [hit-date ".csv"]

            ; The file for the first date in the log is appended to when it
            ; already exists; every other file is started afresh with the
            ; header line.
            if any [checksum-date  not exists? csv-file-name] [
                write csv-file-name join csv-header newline
            ]
            checksum-date: hit-date
        ]
        if error? try [parse-log-line log-line csv-file-name] [
            skipped-lines: skipped-lines + 1
        ]
    ]
]

if skipped-lines > 0 [
    print [skipped-lines "log line(s) could not be processed and were skipped"]
]

; Save the DNS cache for the next run. The text is assembled in memory and
; written with a single WRITE; each entry is followed by one space, so the
; file reads back as the same list of alternating IP / domain entries.
library-text: make string! 1024
foreach library-entry dns-library [
    repend library-text [library-entry " "]
]
write %dns-library.r library-text

-- 
To unsubscribe from this list, please send an email to
[EMAIL PROTECTED] with "unsubscribe" in the 
subject, without the quotes.

Reply via email to