# -*- coding: utf-8 -*-
#
# Provenance
#   http://www.mediawiki.org/wiki/Special:Code/MediaWiki/100158
#   Revision: 100158
#   Author: platonides
#   Date: 2011-10-18 20:52:08 +0000 (Tue, 18 Oct 2011)
#   Log message: Import wfMsg statistics script by Seb35, per
#     http://lists.wikimedia.org/pipermail/mediawiki-i18n/2011-September/000344.html
#     thread.
#     Source https://svn.toolserver.org/svnroot/seb35/mediawiki/read_wfMsgCalls.py r18
#   Added path: trunk/tools/code-utils/read_wfMsgCalls.py (rev 100158)
#   Property changes: svn:eol-style = native
#   Posted to: MediaWiki-CVS mailing list, [email protected],
#     https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs
#
"""Two small tools to analyse the keys of the i18n messages of MediaWiki.

1/ Get the current keys (and corresponding language and message) from their
   definition.
2/ Get the calls of the wfMsg functions (and corresponding file, line, type
   (wfMsg, wfMsgForContent, etc.), message key (when possible)).

Some details:

* this program calls a PHP interpreter, so you need it
* the occurrences of wfMsg functions in block comments are removed (about 30
  occurrences); single-line comments are not checked
* the key of a message is assumed to be '([a-zA-Z0-9_-]+?)'; it seems quite
  correct after tests (anyway no wfMsg call is forgotten, even if the key is
  not recognized)
* hence a key built from a variable is never computed since that would need a
  deeper program analysis; the cases are wfMsg( $wgLogNames[$type] ) or
  (easier?) wfMessage( 'block-log-flags-' . $flag )
* some calls are missed when called by call_user_func_array, but in these
  cases there is probably no chance to get the associated message key because
  it is probably a variable
* parentheses inside PHP string literals are not understood, so the recorded
  text of such a call may be too short or too long
* the results are CSV
* the format of the messageStrings file is: 1/ language code (from the name
  of the file when available); 2/ message key; 3/ content of the message
* the format of the wfMsgCalls file is: 1/ path of the file; 2/ line number;
  3/ wfMsg type (wfMsg, wfMessage, etc.); 4/ message key (when possible);
  5/ complete call of the function
"""

import bisect
import contextlib
import csv
import os
import os.path
import re
import subprocess


# # # # # # # #
# Parameters  #
# # # # # # # #

# BASE PARAMETERS

# Folder containing a tree of MediaWiki
baseFolder = "mediawiki/repo/phase3"

# Name of the CSV result file (in the current folder) containing the calls to
# the functions wfMsg* (specified thereafter in a parameter) obtained by
# analysing the code
wfMsgCallsResultFile = "wfMsgCalls.csv"

# Name of the CSV result file (in the current folder) containing the
# associations lang-msgkey-message by retrieving the content of the PHP
# $messages variable in the 'languages' and i18n folders
messageStringsResultFile = "messageStrings.csv"

# When True the content of the messages is NOT saved (about 10 MiB without the
# content and 30 MiB with it)
lightMessageStrings = False

# Name of the wfMsg functions to search in the code
messageFunctions = [ "wfMsg", "wfMessage", "wfMessageFallback", "wfMsgExt", "wfMsgForContent", "wfMsgNoTrans", "wfMsgForContentNoTrans", "wfMsgReal", "wfMsgHtml", "wfMsgWikiHtml", "wfEmptyMsg", "wfMsgReplaceArgs", "wfMsgGetKey" ]

# PHP interpreter used to evaluate the message files
phpExecutable = "php"

# Token substituted (on the PHP side) for the line breaks found inside a
# message, so that every message travels on exactly one output line
newlinePlaceholder = "5197361546748612348916973"


# MESSAGES FOLDERS AND FILES

# Folders (keep the '#' entry to include messagesIndividualFiles)
messagesFolders = { 'phase3': [ 'languages/messages' ], 'extensions': [ ], '#':'#' }

# Exclude these files
messagesExcludeFiles = []

# Include these files (must not be in the previous folders else it would be duplicated)
messagesIndividualFiles = []


# CODE FOLDERS AND FILES

# Folders (keep the '#' entry to include codeIndividualFiles)
codeFolders = [ "includes", "extensions", "skins", "languages/classes", "#" ]

# Exclude these files
codeExcludeFiles = []

# Include these files (must not be in the previous folders else it would be duplicated)
codeIndividualFiles = [ "languages/Language.php", "languages/LanguageConverter.php", "languages/Names.php", "resources/Resources.php" ]


# # # # # # # # # #
# Shared helpers  #
# # # # # # # # # #

@contextlib.contextmanager
def workingDirectory(path):
    """Run the ``with`` body inside *path*, then restore the previous cwd."""
    previous = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(previous)


def writeCsv(path, rows):
    """Write *rows* (an iterable of lists) to the CSV file *path*."""
    # newline='' lets the csv module control the line terminators itself.
    with open(path, 'w', newline='', encoding='utf-8') as handle:
        csv.writer(handle).writerows(rows)


# # # # # # # # # # # #
# Read the i18n files #
# # # # # # # # # # # #

# Raw string: the \r and \n escapes must reach PHP untouched.
_phpDumpTemplate = r"""<?php
require( '%(defines)s' );
require( '%(messages)s' );
if( isset( $messages ) && is_array( $messages ) && count( $messages ) > 0 ) {
    if( is_array( current( $messages ) ) ) {
        foreach( $messages as $lang => $msgs )
            foreach( $msgs as $key => $msg )
                echo $lang.'|'.$key.' '.str_replace( array("\r\n", "\n", "\r"), "%(placeholder)s", $msg )."\n";
    }
    else {
        foreach( $messages as $key => $msg )
            echo $key.' '.str_replace( array("\r\n", "\n", "\r"), "%(placeholder)s", $msg )."\n";
    }
}
"""


def phpSingleQuoted(text):
    """Escape *text* for use inside a PHP single-quoted string literal."""
    # Backslashes first, otherwise the ones added for the quotes get doubled.
    return text.replace('\\', '\\\\').replace("'", "\\'")


def buildPhpDumpScript(messagesFolder, filename):
    """Return the PHP program that prints the $messages array of one file.

    The program prints one line per message: ``key message`` for a flat
    array, ``lang|key message`` for an array indexed by language. Line
    breaks inside a message are replaced by ``newlinePlaceholder``.
    """
    return _phpDumpTemplate % {
        'defines': phpSingleQuoted(os.path.join('includes', 'Defines.php')),
        'messages': phpSingleQuoted(os.path.join(messagesFolder, filename)),
        'placeholder': newlinePlaceholder,
    }


def runPhpDump(messagesFolder, filename):
    """Run the dump program for one file and return what PHP printed.

    Raises:
        OSError: if the PHP interpreter cannot be started.
    """
    process = subprocess.Popen(
        [phpExecutable], stdin=subprocess.PIPE, stdout=subprocess.PIPE
    )
    # communicate() feeds stdin, drains stdout and reaps the child.
    output, _ = process.communicate(
        buildPhpDumpScript(messagesFolder, filename).encode('utf-8')
    )
    return output.decode('utf-8', 'replace')


def parseDumpedMessages(output, defaultLang='', light=None):
    """Turn the text printed by the dump program into CSV rows.

    Args:
        output: text made of ``key message`` or ``lang|key message`` lines.
        defaultLang: language used (lower-cased) for the lines which do not
            carry their own language.
        light: when true the message content is left out of the rows;
            defaults to the ``lightMessageStrings`` parameter.

    Returns:
        A list of ``[lang, key]`` or ``[lang, key, message]`` lists.
    """
    if light is None:
        light = lightMessageStrings

    rows = []
    # Only '\n' ends a record; str.splitlines() would also cut on characters
    # such as U+2028 which can legitimately appear inside a message.
    for line in output.split('\n'):
        head, separator, body = line.partition(' ')
        if not separator:
            # Blank line or stray output of the PHP file: not a record.
            continue
        if '|' in head:
            lang, key = head.split('|', 1)
        else:
            lang, key = defaultLang.lower(), head
        row = [lang, key]
        if not light:
            row.append(body.replace(newlinePlaceholder, '\n'))
        rows.append(row)
    return rows


def listMessageFiles():
    """Yield ``(folderType, folder, filename)`` for every candidate file.

    Paths are relative to the current directory, expected to be baseFolder.
    """
    for folderType, folders in messagesFolders.items():
        if folderType == '#':
            # Individual files are given with their own relative path.
            for filename in messagesIndividualFiles:
                yield folderType, '', filename
            continue

        if folderType == 'extensions':
            # Extensions keep their i18n files anywhere in their tree.
            folders = [
                dirpath
                for root in folders
                for dirpath, _dirnames, _filenames in os.walk(root)
                if '.svn' not in dirpath
            ]

        for folder in folders:
            for filename in sorted(os.listdir(folder)):
                yield folderType, folder, filename


def collectI18nMessages():
    """Return the rows of the messageStrings file.

    Must be called with baseFolder as current directory.
    """
    rows = []
    for folderType, folder, filename in listMessageFiles():
        if not filename.endswith('.php'):
            continue
        if folderType == 'extensions' and not filename.endswith('.i18n.php'):
            continue
        if filename in messagesExcludeFiles:
            continue

        # Only the core files are named after their language (MessagesXx.php).
        isCoreLanguage = folderType == 'phase3' and filename.startswith('Messages')
        lang = filename[8:-4] if isCoreLanguage else ''

        messages = parseDumpedMessages(runPhpDump(folder, filename), lang)
        if not messages:
            if isCoreLanguage:
                print('Core language '+lang+' doesn’t have a $message variable in '+filename+' or is empty.')
            else:
                print('File '+filename+' doesn’t have a $message variable or is empty.')
            continue
        rows.extend(messages)
    return rows


# # # # # # # # # # # #
# Read the code files #
# # # # # # # # # # # #

# A block comment, or the rest of the text when the comment is never closed.
_blockComment = re.compile(r'/\*.*?(?:\*/|\Z)', re.S)
# CRLF must come first so that it counts as a single line break.
_lineBreak = re.compile(r'\r\n|\n|\r')
_parenthesis = re.compile(r'[()]')
# First argument when it is a plain quoted key followed by ',' or ')'.
_literalKey = re.compile(r"""\(\s*(['"])([a-zA-Z0-9_-]+?)\1\s*[,)]""")


def _alternation(functionNames):
    """Return a regex alternation of the names, longest first."""
    ordered = sorted(functionNames, key=len, reverse=True)
    return '|'.join(re.escape(name) for name in ordered)


def stripCallsInBlockComments(content, functionNames=None):
    """Remove the message function names found inside /* ... */ comments.

    Only the names are removed, so line numbers are unchanged and the
    commented-out calls are no longer recognised as calls.
    """
    names = messageFunctions if functionNames is None else functionNames
    if not names:
        return content
    namePattern = re.compile(_alternation(names))
    return _blockComment.sub(
        lambda comment: namePattern.sub('', comment.group(0)), content
    )


def lineStartOffsets(content):
    """Return the offsets at which each line of *content* begins."""
    return [0] + [found.end() for found in _lineBreak.finditer(content)]


def matchingParenthesis(text, opening):
    """Return the offset of the ')' closing the '(' at *opening*, or None."""
    depth = 0
    for found in _parenthesis.finditer(text, opening):
        depth += 1 if found.group(0) == '(' else -1
        if depth == 0:
            return found.start()
    return None


def extractMessageKey(arguments):
    """Return the literal message key opening *arguments*, or ''.

    *arguments* is the text of a call starting at its opening parenthesis.
    A key built from an expression or a variable gives ''.
    """
    found = _literalKey.match(arguments)
    return found.group(2) if found else ''


def findMessageCalls(content, functionNames=None):
    """Find the calls of the message functions in some PHP source text.

    Args:
        content: the PHP source.
        functionNames: names to look for; defaults to ``messageFunctions``.

    Returns:
        A list of ``[line number, function name, key, complete call]`` in
        order of appearance. Calls nested in the arguments of another call
        get their own entry.

    Raises:
        ValueError: when the parentheses of a call are never balanced.
    """
    names = messageFunctions if functionNames is None else functionNames
    if not names:
        return []

    content = stripCallsInBlockComments(content, names)
    offsets = lineStartOffsets(content)
    # Matching only up to the opening parenthesis keeps nested calls visible
    # to the following iterations.
    callStart = re.compile('(' + _alternation(names) + r') *\(')

    calls = []
    for found in callStart.finditer(content):
        # Number of lines starting at or before the call = its line number.
        line = bisect.bisect_right(offsets, found.start())
        opening = found.end() - 1
        closing = matchingParenthesis(content, opening)
        if closing is None:
            raise ValueError('parenthesis expected (line %d)' % line)
        calls.append([
            line,
            found.group(1),
            extractMessageKey(content[opening:closing + 1]),
            content[found.start():closing + 1],
        ])
    return calls


def listCodeFiles():
    """Yield ``(directory, filename)`` for every candidate code file.

    Paths are relative to the current directory, expected to be baseFolder.
    """
    for folder in codeFolders:
        if folder == '#':
            # Individual files are given with their own relative path.
            for filename in codeIndividualFiles:
                yield '', filename
            continue
        for dirpath, _dirnames, filenames in os.walk(folder):
            if '.svn' in dirpath:
                continue
            for filename in sorted(filenames):
                yield dirpath, filename


def collectMessageCalls():
    """Return the rows of the wfMsgCalls file.

    Must be called with baseFolder as current directory.

    Raises:
        ValueError: when a file holds a call with unbalanced parentheses.
    """
    rows = []
    for directory, filename in listCodeFiles():
        if not filename.endswith('.php'):
            continue
        if filename in codeExcludeFiles:
            continue

        path = os.path.join(directory, filename)
        # newline='' keeps the line breaks of the file as they are.
        with open(path, encoding='utf-8', errors='replace', newline='') as handle:
            content = handle.read()

        try:
            calls = findMessageCalls(content)
        except ValueError as error:
            raise ValueError('%s: %s' % (path, error)) from error
        rows.extend([path] + call for call in calls)
    return rows


def main():
    """Produce both CSV files in the directory the script is started from."""
    with workingDirectory(baseFolder):
        i18nMessages = collectI18nMessages()
    writeCsv(messageStringsResultFile, i18nMessages)

    with workingDirectory(baseFolder):
        wfMsgCalls = collectMessageCalls()
    writeCsv(wfMsgCallsResultFile, wfMsgCalls)


if __name__ == '__main__':
    main()
