Hi,
I am looking for a python based metaprogramming library that will
allow me to tune programs using python and then export the optimal
cuda (for now) code to a file so that it can be used from a c++
application.
1) What is the best choice for this?
The metaprogramming systems associated with pycuda are templating
based systems or codepy. Codepy seemed to be recommended by the
documentation.
However the problem that I encountered was that the codepy
representation of code was only marginally understandable. Because I
wasn't too keen on converting a bigger kernel into this format I had a
look around. I noticed that the codepy classes were very similar to
the python AST, for obvious reasons.
Based on this I wrote an (incomplete) AST transformer that converted a
python implementation of the codepy demo program into the codepy
representation.
This is more or less easy enough for the simple program provided.
However the question is if this is still a viable way to do things
when kernels get more complicated. The python AST transformer will
allow one to do pretty much every possible code transformation, in the
end it probably comes down to whether it will be possible to add all of
the Python-to-C translation hints/options (e.g. loop unrolling, etc.) in a
way that keeps the code legal Python and readable.
At the moment I think this is feasible, since pretty much everything
can be wrapped in (or preceded by) an (empty) function whose name can
be used to influence the AST transformer's behavior.
Next I will try to implement a 2D convolution.
2) Is there a frontend script for codepy that compiles and evaluates
the kernel performance?
At the moment this seems like the most pythonic way to do
metaprogramming to me in terms of readability and flexibility, though
not in ease of implementation.
3) In general, does this approach seem sane to you, and if so, what
should it look like in order to be useful to others too?
BR, Max
import ast,inspect
from functools import wraps
from copy import deepcopy
import numpy
import codepy
from codepy.cgen import FunctionBody, FunctionDeclaration,Typedef,POD,Value,\
Pointer, Module, Block, Initializer, Assign
from cgen.cuda import CudaGlobal
import pdb
'''
All optimized variables are numpy.int32 for now.
'''
# Type assigned to every @optimizes parameter and gentime value.
# NOTE(review): the original docstring said "uint32_t", but the value below
# (and the checks in typeTransformer) use numpy.int32 — kept int32.
opt_var_type = numpy.int32
#---------------------------------------------------------------------
#Decorator Functions
#just uses their names and arguments for information when translating
def accepts(*args, **kwargs):
    """Marker decorator naming the accepted argument types.

    A no-op at Python runtime: only the decorator's *name* and arguments
    are inspected when the function's source is translated.
    """
    return lambda fn: fn
def optimizes(*opt_args, **opt_kwargs):
    """Marker decorator declaring the optimization parameters of a kernel.

    Does nothing at Python runtime; the AST transformer reads its name and
    argument list from the decorated function's source.
    """
    def passthrough(fn):
        return fn
    return passthrough
def returns(*_args, **_kwargs):
    """Marker decorator naming the return type.

    Metadata only — the wrapped function is returned unchanged.
    """
    return lambda fn: fn
def __global__(f):
    """Marker decorator flagging *f* as a CUDA ``__global__`` kernel.

    No-op at Python runtime; the AST transformer looks for this decorator's
    name to wrap the generated declaration in CudaGlobal.
    """
    return f
#-----------------------------------------------------------------------
#Class Defn
#again just use their names
class OptimizationQuantity:
    """Describes one tunable code-generation parameter and its search space.

    Parameters
    ----------
    default : value used when no tuning has been performed
    min, max, step : bounds and stride of the search range
        (these names shadow builtins, but they are part of the public
        keyword interface used by callers, so they are kept)
    """
    def __init__(self, default, min, max, step):
        self.default = default
        self.min = min
        self.max = max
        self.step = step

    def __repr__(self):
        # Added for debuggability; purely additive, no caller depends on it.
        return "%s(default=%r, min=%r, max=%r, step=%r)" % (
            type(self).__name__, self.default, self.min, self.max, self.step)
#-----------------------------------------------------------------------
#Variable Information class
class VarInfo:
    """Per-variable bookkeeping for the translator.

    ``types`` maps a variable name to its numpy type; ``modes`` maps it to
    either "runtime" (value exists only when the kernel runs) or "gentime"
    (value is known at code-generation time).
    """
    def __init__(self):
        # FIX: the original declared ``types`` and ``modes`` as *class*
        # attributes, so every VarInfo instance shared the same mutable
        # dicts (the classic mutable-class-attribute bug). They are now
        # per-instance; existing callers that mutate ``instance.types``
        # keep working unchanged.
        #All the declared variables should go in here, together with their type
        self.types = {}
        #This tells us if the variable is defined during runtime or code
        #generation ("gentime")
        self.modes = {}

    def __add__(self, other):
        """Merged copy of self and other; does not mutate either operand.

        On a key clash, ``other``'s entry wins (same as the original
        ``dict(list(a.items()) + list(b.items()))`` behavior).
        """
        result = VarInfo()
        result.types = {**self.types, **other.types}
        result.modes = {**self.modes, **other.modes}
        return result
#Store the types of CUDA's variables here
_var_info = VarInfo()
# CUDA thread/block indices: int32 values that exist only while the
# kernel runs, so their mode is "runtime".
_var_info.types.update({"threadIdx": numpy.int32, "blockIdx": numpy.int32})
_var_info.modes.update({"threadIdx": "runtime", "blockIdx": "runtime"})
#This is extended to functions. give mode as gentime if the function can be
#evaluated then under the condition that all its arguments are also "gentime"
#constant
_var_info.types.update({"range": numpy.int32})
_var_info.modes.update({"range": "gentime"})
#------------------------------------------------------------------------
#Renamed range function
#This tells the generator to unroll the loop
#this could happen automatically but its more understandable for now
def range_unroll(*bounds, **kw):
    """Behaves exactly like :func:`range`.

    The distinct *name* is the signal: it tells the code generator to
    unroll the loop (this could happen automatically, but an explicit
    name is more understandable for now).
    """
    return range(*bounds, **kw)
'''
This turns an expression with mixed mode into a codepy string.
In these strings the gentime stuff is exposed as variables, and the runtime stuff as strings.
'''
class VisitorBase():
    """Mixin with helpers shared by the concrete AST transformers.

    Subclasses are expected to provide ``self.var_info`` (a VarInfo) and,
    where needed, ``self.opt_param`` (dict of optimization-parameter values).
    """
    #----------------------------------------------------------
    #helper functions
    #these extract type and mode information from ast nodes
    def getType(self, node):
        """Return the numpy type of the expression rooted at *node*.

        Works on a deep copy so the caller's tree is left untouched.
        """
        node_copy = deepcopy(node)
        typer = typeTransformer(self.var_info)
        return typer.visit(node_copy)

    def getMode(self, node):
        """Return "gentime" or "runtime" for the expression at *node*."""
        node_copy = deepcopy(node)
        moder = modeTransformer(self.var_info)
        return moder.visit(node_copy)

    def getExpr(self, node):
        """Render *node* as a codepy-compatible C expression string."""
        node_copy = deepcopy(node)
        exper = exprTransformer(self.var_info, self.opt_param)
        str_rep = exper.visit(node_copy)
        return str_rep

    def getOptRepl(self, node):
        """Return a copy of *node* with optimization-parameter names
        replaced by literal number nodes."""
        node_copy = deepcopy(node)
        replr = optReplacer(self.var_info, self.opt_param)
        str_rep = replr.visit(node_copy)
        return str_rep

    def allArgs(self, node):
        """Collect the argument nodes of a FunctionDef or Call.

        FIX over the original version:
        * ``allargs += ...`` aliased ``node.args.args`` and extended the
          AST's own list in place, mutating the tree — a fresh list is
          built instead;
        * ``+=`` with a single ``arg``/kwargs node raised TypeError (it is
          a node, not a sequence) — the node is appended now.
        """
        if isinstance(node, ast.FunctionDef):
            allargs = list(node.args.args)
            if node.args.kwarg:
                allargs.append(node.args.kwarg)
            return allargs
        allargs = list(node.args)
        # ast.Call lost its ``kwargs`` attribute in Python 3.5; be defensive.
        call_kwargs = getattr(node, "kwargs", None)
        if call_kwargs:
            allargs.append(call_kwargs)
        return allargs
'''
This transformer takes the node of an expression and tries to find its
type. It's a bit of a hacky implementation because we are replacing the
ast nodes with the class of the type; however, since only we touch the
tree it seems to be fine. It would be better to replace the ast instances
with another valid ast instance that holds only the type information.
'''
class typeTransformer(ast.NodeTransformer):
    """Infer the numpy type of an expression subtree.

    Each visited node collapses to the type class it evaluates to,
    looked up in ``info.types``.
    """
    def __init__(self, info):
        self.var_info = info

    def getBinOpType(self, op, left, right):
        # Only int32 (op) int32 arithmetic is understood so far.
        if left == numpy.int32 and right == numpy.int32:
            return numpy.int32
        raise NotImplementedError

    def visit_Name(self, node):
        known = self.var_info.types
        if node.id not in known:
            raise NameError(node.id, known.keys())
        return known[node.id]

    def visit_Attribute(self, node):
        base = node.value.id
        if base not in self.var_info.types:
            raise NotImplementedError
        return self.var_info.types[base]

    def visit_BinOp(self, node):
        left_type = self.visit(node.left)
        right_type = self.visit(node.right)
        return self.getBinOpType(node.op, left_type, right_type)

    def visit_Call(self, node):
        callee = node.func.id
        if callee not in self.var_info.types:
            raise NotImplementedError
        return self.var_info.types[callee]
'''
Like typeTransformer but tells you the mode of an expression
'''
class modeTransformer(ast.NodeTransformer,VisitorBase):
    """Like typeTransformer, but reports an expression's *mode*:
    "gentime" (known at code-generation time) or "runtime".
    """
    def __init__(self, info):
        self.var_info = info

    def getBinOpMode(self, op, left, right):
        # A binary expression is gentime only if both operands are.
        return "gentime" if left == "gentime" == right else "runtime"

    def visit_Name(self, node):
        modes = self.var_info.modes
        if node.id not in modes:
            print(node.id, self.var_info.types)
            raise NotImplementedError
        return modes[node.id]

    def visit_Call(self, node):
        if node.func.id not in self.var_info.modes:
            raise NotImplementedError
        # A call is gentime when the callee is gentime-capable and every
        # argument is gentime as well.
        arg_modes = [self.visit(a) for a in self.allArgs(node)]
        return "gentime" if all(m == "gentime" for m in arg_modes) else "runtime"

    def visit_Attribute(self, node):
        base = node.value.id
        if base not in self.var_info.modes:
            raise NotImplementedError
        return self.var_info.modes[base]

    def visit_BinOp(self, node):
        left_mode = self.visit(node.left)
        right_mode = self.visit(node.right)
        return self.getBinOpMode(node.op, left_mode, right_mode)
'''
Compile a ast node to codepy compatible strings(c++)
This is like codegen for c++
'''
class exprTransformer(ast.NodeTransformer,VisitorBase):
    """Render an expression AST as a C source string for codepy.

    Optimization parameters ("gentime" values) are inlined as literals;
    runtime names pass through as identifiers.
    """
    def __init__(self, info, opt_param):
        self.var_info = info
        self.opt_param = opt_param

    def getBinOpExpr(self, op, left, right):
        # Only * and + are supported for now.
        if isinstance(op, ast.Mult):
            return " * ".join((left, right))
        if isinstance(op, ast.Add):
            return " + ".join((left, right))
        raise NotImplementedError(op, left, right)

    def visit_Name(self, node):
        name = node.id
        if name in self.opt_param:
            # gentime value: inline its concrete value as a literal
            return str(self.opt_param[name])
        if name in self.var_info.types:
            return name
        raise NameError(name)

    def visit_Subscript(self, node):
        base = node.value.id
        if base not in self.var_info.types:
            raise NameError(base)
        return base + "[%s]" % self.visit(node.slice.value)

    def visit_Attribute(self, node):
        base = node.value.id
        if base not in self.var_info.types:
            raise NameError(base)
        return "%s.%s" % (base, node.attr)

    def visit_BinOp(self, node):
        left_str = self.visit(node.left)
        right_str = self.visit(node.right)
        return self.getBinOpExpr(node.op, left_str, right_str)
'''
Replace the optimization parameter names in the ast tree with their
values. This uses proper ast types instead of hacky string replacement.
'''
class optReplacer(ast.NodeTransformer,VisitorBase):
    """Substitute optimization-parameter Name nodes with literal numbers.

    Operates on the AST itself (ast.Num nodes), not on strings.
    """
    def __init__(self, info, opt_param):
        self.var_info = info
        self.opt_param = opt_param

    def visit_Name(self, node):
        name = node.id
        if name in self.opt_param:
            # Known optimization parameter: replace with a number literal
            # that carries the original source position.
            return ast.Num(self.opt_param[name],
                           lineno=node.lineno,
                           col_offset=node.col_offset)
        if name in self.var_info.types:
            # Known runtime variable: leave the node untouched.
            return node
        raise NameError(name)
class codepyVisitor(ast.NodeTransformer,VisitorBase):
    """Top-level transformer: converts the AST of a decorated Python kernel
    into a codepy/cgen tree (Module, FunctionBody, ... objects).

    ``opt_param`` maps each optimization-parameter name declared through
    the ``@optimizes`` decorator to the concrete value to generate code for.
    """
    def __init__(self,**opt_param):
        # Shared module-level variable info (CUDA builtins, range, ...).
        self.var_info = _var_info
        # Names assigned so far; reassignment is not implemented yet.
        self.scope = []
        self.opt_param = opt_param
    #---------------------------------------------------------
    #visitor functions
    def generic_visit(self,node):
        # Fallback trace for node types without an explicit handler.
        # NOTE(review): this delegates to NodeVisitor.generic_visit, which
        # returns None — an unhandled *statement* therefore yields None and
        # trips the ValueError check in visit_FunctionDef below.
        print("Visit: ",type(node).__name__)
        ast.NodeVisitor.generic_visit(self,node)
    def visit_Name(self,node):
        # Names pass through untouched; rendering to C strings happens in
        # exprTransformer.
        return node
    def visit_Module(self,node):
        """Translate every top-level statement and wrap them in a Module."""
        new_module_body = [self.visit(e) for e in node.body]
        return Module(new_module_body)
    def visit_FunctionDef(self,node):
        """Translate one kernel definition into a cgen FunctionBody."""
        #Get the function name
        new_func_name = node.name
        #figure out the return type
        new_func_ret = "void" #default
        # NOTE(review): in the py3 ast, node.returns is a node (or None),
        # never a list, so this branch looks dead; it also assigns the
        # misspelled 'new_fun_ret' and then raises unconditionally.
        if( isinstance(node.returns, list)):
            assert(len(node.returns)==1)
            new_fun_ret = node.returns[0]
            raise NotImplementedError
        #First we have to collect information about the optimization
        #variables from the decorators
        #by default
        new_func_type = "host"
        for decorator in node.decorator_list:
            if(isinstance(decorator,ast.Call)):
                decorator_name = decorator.func.id
                if(decorator_name == "optimizes"):
                    if decorator.args:
                        for arg in decorator.args:
                            # Every @optimizes argument becomes a gentime
                            # variable of the fixed opt_var_type.
                            self.var_info.types[arg.id] = opt_var_type
                            self.var_info.modes[arg.id] = "gentime"
                            if(arg.id not in self.opt_param):
                                # NOTE(review): message is missing a space
                                # before "not" and misspells "specified".
                                raise NameError("Optimization parameter: "+arg.id+"not specificed")
                    # NOTE(review): ast.Call has no 'kwargs' attribute from
                    # Python 3.5 on — this line raises AttributeError there.
                    if decorator.kwargs:
                        for kwarg in decorator.kwargs:
                            raise NotImplementedError
            elif(isinstance(decorator,ast.Name)):
                decorator_name = decorator.id
                if(decorator_name == "__global__"):
                    new_func_type = "global"
                else:
                    raise NotImplementedError
        #Then we collect the function arguments and their annotations
        new_func_args = []
        for arg in self.allArgs(node):
            arg_name = arg.arg
            # Only Pointer(numpy.<type>) annotations are supported so far.
            if(isinstance(arg.annotation,ast.Call) and arg.annotation.func.id == "Pointer"):
                assert(len(arg.annotation.args)==1)
                assert(isinstance(arg.annotation.args[0],ast.Attribute))
                # Evaluate the annotation expression (e.g. numpy.float32).
                # eval here runs our own kernel source, not untrusted input.
                arg_type = eval(compile(ast.Expression(arg.annotation.args[0]),'','eval',))
                new_func_args.append(Pointer(POD(arg_type,arg_name)))
            else:
                raise NotImplementedError(arg_name,arg)
            self.var_info.types[arg_name] = arg_type
            self.var_info.modes[arg_name] = "runtime"
        #After we have collected the type info walk the tree
        #We get either a list of cgen objects or a single cgen object
        new_func_body = []
        for e in node.body:
            res = self.visit(e)
            if(isinstance(res,codepy.cgen.Generable)):
                new_func_body.append(res)
            elif(isinstance(res,list)):
                # e.g. an unrolled for loop yields a list of statements
                new_func_body.extend(res)
            else:
                raise ValueError
        #Now we have done all of the hard work, just put in our new_func_*
        new_func_decl = FunctionDeclaration( Value(new_func_ret,new_func_name),
                                             arg_decls=new_func_args
                                             )
        if(new_func_type == "global"):
            new_func_decl = CudaGlobal(new_func_decl)
        return FunctionBody(new_func_decl,Block(new_func_body))
    def visit_Assign(self,node):
        """Translate an assignment: a new scalar becomes an Initializer,
        an array-element store becomes an Assign."""
        for target in node.targets:
            if(isinstance(target,ast.Name)):
                if target.id in self.scope:
                    # Reassignment of an existing name: not implemented.
                    raise NotImplementedError
                else:
                    #then create an initializer
                    new_var_name = target.id
                    new_var_type = self.getType(node.value)
                    new_var_value = self.getExpr(node.value)
                    #update the internal variable information
                    self.var_info.types[new_var_name] = new_var_type
                    self.var_info.modes[new_var_name] = "runtime"
                    # NOTE(review): returns on the first Name target, so
                    # multi-target assignments translate only the first.
                    return Initializer(POD(new_var_type,new_var_name),new_var_value)
            elif(isinstance(target,ast.Subscript)):
                #Must be an assign
                #convert the slice expression
                index_mode = self.getMode(target.slice.value)  # NOTE(review): unused
                index_type = self.getType(target.slice.value)
                assert(index_type == numpy.int32)
                return Assign(
                    target.value.id+"[ "+self.getExpr(target.slice.value)+" ]",
                    self.getExpr(node.value))
    def visit_For(self,node):
        """Translate a for loop; gentime iterators are unrolled."""
        #A for loop defines a variable based on an iterator
        #first we check what the iterator is to see if the variable is
        #a runtime or generation time constant
        iter_var_name = node.target.id
        iter_var_type = self.getType(node.iter)
        iter_var_mode = self.getMode(node.iter)
        #Then add the loop variable to our collection
        self.var_info.types[iter_var_name] = iter_var_type
        self.var_info.modes[iter_var_name] = iter_var_mode
        if(iter_var_mode == "gentime"):
            # gentime iterator: unroll by evaluating it here and emitting
            # one translated copy of the body per iteration value.
            iter_ast = self.getOptRepl(node.iter)
            iter_obj = compile(ast.Expression(iter_ast),'','eval')
            trl_body = []
            for i in eval(iter_obj):
                # Expose the loop variable's current value as a gentime
                # parameter while translating the body.
                self.opt_param[iter_var_name] = i
                #After we have collected the type info walk the tree
                trl_body.extend( [self.visit(e) for e in deepcopy(node.body)] )
            del self.opt_param[iter_var_name]
            return trl_body
        else:
            raise NotImplementedError
        #the iter variable goes out of scope now
        # NOTE(review): unreachable — both branches above return or raise.
        self.var_info.types[iter_var_name] = iter_var_type
        self.var_info.modes[iter_var_name] = iter_var_mode
def py2codepy(function, **opt_param):
    """Translate *function* (a decorated Python kernel) into a codepy tree.

    Parameters
    ----------
    function : the Python function to translate. Only its *source* is
        re-parsed; the function itself is never executed.
    opt_param : concrete values for every name declared via @optimizes.

    Returns the cgen/codepy object produced by codepyVisitor.
    """
    import textwrap
    #first get the function's ast
    # FIX: dedent so a function defined inside a class or another function
    # (whose getsource output is indented) still parses.
    source = textwrap.dedent(inspect.getsource(function))
    # FIX: the original passed 'eval' as ast.parse's *filename* argument —
    # almost certainly meant as the mode. Be explicit: parse as a module.
    func_ast = ast.parse(source, filename="<py2codepy>", mode="exec")
    #instantiate ast transformer
    visitor = codepyVisitor(**opt_param)
    #apply transformation
    return visitor.visit(func_ast)
#--------------------------------------------------------
#--------------------------------------------------------
#--------------------------------------------------------
# Demo: the element-wise add kernel, with two tunable parameters.
block_size = OptimizationQuantity(256,min=1,max=200,step=10)
thread_strides = OptimizationQuantity(16,min=1,max=20,step=1)
# NOTE: 'add' is never executed as Python (threadIdx/blockIdx are not
# defined at runtime); py2codepy only re-parses its source via
# inspect.getsource and translates the AST.
@optimizes(block_size, thread_strides)
@__global__
def add(tgt:Pointer(numpy.float32), op1:Pointer(numpy.float32), op2:Pointer(numpy.float32)):
    idx = threadIdx.x + thread_strides * block_size * blockIdx.x
    for i in range(block_size):
        tgt[idx+ i*thread_strides] = op1[idx+i*thread_strides] + op2[idx + i*thread_strides]
print( py2codepy(add,block_size=256,thread_strides=16) )
_______________________________________________
PyCUDA mailing list
[email protected]
http://lists.tiker.net/listinfo/pycuda