Source code for brian2cuda.cuda_generator

import itertools

import numpy as np
import re

from brian2.utils.stringtools import (deindent, stripped_deindented_lines,
                                      word_substitute)
from brian2.utils.logger import get_logger
from brian2.utils.stringtools import get_identifiers
from brian2.parsing.rendering import CPPNodeRenderer
from brian2.core.functions import Function, DEFAULT_FUNCTIONS
from brian2.core.clocks import Clock
from brian2.core.preferences import prefs
from brian2.core.variables import ArrayVariable
from brian2.codegen.generators.cpp_generator import c_data_type
from brian2.codegen.generators.base import CodeGenerator
from brian2.devices import get_device
from brian2cuda.utils.gputools import get_cuda_runtime_version


__all__ = ['CUDACodeGenerator', 'CUDAAtomicsCodeGenerator', 'c_data_type']


logger = get_logger('brian2.codegen.generators.cuda_generator')


class ParallelisationError(Exception):
    pass
# This is a function since this code needs to be generated after user preferences
# are set (for GPU selection)
def _generate_atomic_support_code():
    # (arg_dtype, int_dtype, name in type cast function)
    overloads = [('int', 'int', 'int'),
                 ('float', 'int', 'int'),
                 ('double', 'unsigned long long int', 'longlong')]

    cuda_runtime_version = get_cuda_runtime_version()

    # Note: There are atomic functions that are supported only for compute
    # capability >= 3.5. We don't check for those as we require at least 3.5. If we
    # ever support smaller CC, we need to adapt the code here (see Atomic Functions
    # in CUDA Programming Guide)
    device = get_device()
    assert device.minimal_compute_capability >= 3.5, "Need to adapt atomic support code"

    software_implementation_float_dtype = '''
        // software implementation
        {int_dtype}* address_as_int = ({int_dtype}*)address;
        {int_dtype} old = *address_as_int, assumed;

        do {{
            assumed = old;
            old = atomicCAS(address_as_int, assumed,
                            __{arg_dtype}_as_{val_type_cast}(val {op} __{val_type_cast}_as_{arg_dtype}(assumed)));
        // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
        }} while (assumed != old);

        return __{val_type_cast}_as_{arg_dtype}(old);
        '''

    hardware_implementation = '''
        // hardware implementation
        return atomic{op_name}(address, val);
        '''

    atomic_support_code = ''
    for op_name, op in [('Add', '+'), ('Mul', '*'), ('Div', '/')]:
        for arg_dtype, int_dtype, val_type_cast in overloads:
            # atomicAdd has hardware implementations, depending on dtype, compute
            # capability and CUDA runtime version. This section uses them when
            # possible.
            if op_name == 'Add':
                format_sw = True
                code = '''
                inline __device__ {arg_dtype} _brian_atomic{op_name}({arg_dtype}* address, {arg_dtype} val)
                {{
                '''
                if arg_dtype in ['int', 'float']:
                    code += f'''
                    {hardware_implementation}
                    '''
                elif arg_dtype == 'double':
                    if cuda_runtime_version >= 8.0:
                        # Check for CC at runtime to use software or hardware
                        # implementation.
                        # Don't need to check for defined __CUDA_ARCH__, it is
                        # always defined in __device__ and __global__ functions
                        # (no host code path).
                        code += '''
                        #if (__CUDA_ARCH__ >= 600)
                        {hardware_implementation}
                        #else
                        {software_implementation}
                        #endif
                        '''.format(
                            hardware_implementation=hardware_implementation,
                            software_implementation=software_implementation_float_dtype
                        )
                    else:
                        # For runtime < 8.0, there is no atomicAdd hardware
                        # implementation support independent of CC.
                        code += '''
                        {software_implementation}
                        '''.format(
                            software_implementation=software_implementation_float_dtype
                        )
                        format_sw = True
                code += '''
                }}
                '''
                if format_sw:
                    code = code.format(
                        arg_dtype=arg_dtype, int_dtype=int_dtype,
                        val_type_cast=val_type_cast, op_name=op_name, op=op
                    )
                else:
                    code = code.format(arg_dtype=arg_dtype, op_name=op_name)
                atomic_support_code += code
            # For other atomic operations on int types, we can use the same
            # template.
            elif arg_dtype == 'int':
                atomic_support_code += '''
                inline __device__ int _brian_atomic{op_name}(int* address, int val)
                {{
                    // software implementation
                    int old = *address, assumed;

                    do {{
                        assumed = old;
                        old = atomicCAS(address, assumed, val {op} assumed);
                    }} while (assumed != old);

                    return old;
                }}
                '''.format(op_name=op_name, op=op)
            # Else (not atomicAdd and not int types) use this template.
            else:
                # In the software implementation, we treat all data as integer
                # types (since atomicCAS is only defined for integer types) and
                # use atomicCAS to swap the memory with our desired value
                code = '''
                inline __device__ {{arg_dtype}} _brian_atomic{{op_name}}({{arg_dtype}}* address, {{arg_dtype}} val)
                {{{{
                    {software_implementation}
                }}}}
                '''.format(software_implementation=software_implementation_float_dtype)
                # above, we just added the software_implementation code; below we
                # add the other format variables (also present in
                # software_implementation)
                code = code.format(
                    arg_dtype=arg_dtype, int_dtype=int_dtype,
                    val_type_cast=val_type_cast, op_name=op_name, op=op,
                )
                atomic_support_code += code

    return atomic_support_code


# CUDA does not support modulo arithmetic for long double. Since we can't give a
# warning, we let the compilation fail, which gives an error message of type
#   error: more than one instance of overloaded function "_brian_mod" matches the argument list: ...
# TODO: can we produce a more informative error message?
_typestrs = ['int', 'long', 'long long', 'float', 'double']  # , 'long double']

_hightype_support_code = 'template < typename T1, typename T2 > struct _higher_type;\n'
for ix, xtype in enumerate(_typestrs):
    for iy, ytype in enumerate(_typestrs):
        hightype = _typestrs[max(ix, iy)]
        _hightype_support_code += '''
        template < > struct _higher_type<{xtype},{ytype}> {{ typedef {hightype} type; }};
        '''.format(hightype=hightype, xtype=xtype, ytype=ytype)

_mod_support_code = '''
template < typename T1, typename T2 >
__host__ __device__ static inline typename _higher_type<T1,T2>::type
_brian_mod(T1 x, T2 y)
{{
    return x-y*floor(1.0*x/y);
}}
'''

_floordiv_support_code = '''
template < typename T1, typename T2 >
__host__ __device__ static inline typename _higher_type<T1,T2>::type
_brian_floordiv(T1 x, T2 y)
{{
    return floor(1.0*x/y);
}}
'''

_pow_support_code = '''
#ifdef _MSC_VER
#define _brian_pow(x, y) (pow((double)(x), (y)))
#else
#define _brian_pow(x, y) (pow((x), (y)))
#endif
'''
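# Illustrative sketch (not part of the module): after both format passes, the
# ('double', 'unsigned long long int', 'longlong') overload for op 'Mul' above
# expands to roughly the following CUDA function (whitespace simplified):
#
#   inline __device__ double _brian_atomicMul(double* address, double val)
#   {
#       // software implementation
#       unsigned long long int* address_as_int = (unsigned long long int*)address;
#       unsigned long long int old = *address_as_int, assumed;
#       do {
#           assumed = old;
#           old = atomicCAS(address_as_int, assumed,
#                           __double_as_longlong(val * __longlong_as_double(assumed)));
#       // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
#       } while (assumed != old);
#       return __longlong_as_double(old);
#   }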
class CUDACodeGenerator(CodeGenerator):
    '''
    C++ language with CUDA library

    CUDA code templates should provide Jinja2 macros with the following names:

    ``main``
        The main loop.
    ``support_code``
        The support code (function definitions, etc.), compiled in a separate file.

    For user-defined functions, there are two keys to provide:

    ``support_code``
        The function definition which will be added to the support code.
    ``hashdefine_code``
        The ``#define`` code added to the main loop.

    See `TimedArray` for an example of these keys.
    '''
    class_name = 'cuda'

    _use_atomics = False

    # This will be generated when the first class instance calls
    # `determine_keywords()`
    universal_support_code = None

    def __init__(self, *args, **kwds):
        super(CUDACodeGenerator, self).__init__(*args, **kwds)
        self.c_data_type = c_data_type
        self.warned_integral_convertion = False
        self.previous_convertion_pref = None
        self.uses_atomics = False

        # Set conversion types for standard C99 functions in device code.
        # These are used in _add_user_function to format the function code.
        if prefs.devices.cuda_standalone.default_functions_integral_convertion == np.float64:
            self.default_func_type = 'double'
            self.other_func_type = 'float'
        else:  # np.float32
            self.default_func_type = 'float'
            self.other_func_type = 'double'
        # Set the clip function to use either all float or all double arguments,
        # see #51 for details.
        if prefs['core.default_float_dtype'] == np.float64:
            self.float_dtype = 'double'
        else:  # np.float32
            self.float_dtype = 'float'

    @property
    def restrict(self):
        return prefs['codegen.generators.cpp.restrict_keyword'] + ' '

    @property
    def flush_denormals(self):
        return prefs['codegen.generators.cpp.flush_denormals']
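    # Illustrative sketch (hypothetical session, not part of the module): the
    # conversion preference above determines which floating-point type integral
    # arguments of the wrapped C99 math functions are cast to in device code, e.g.
    #
    #   from brian2 import prefs
    #   import numpy as np
    #   prefs.devices.cuda_standalone.default_functions_integral_convertion = np.float32
    #   # -> self.default_func_type == 'float', so the generated _brian_sin
    #   #    wrapper casts integral arguments via `sin((float)value)` on device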
    @staticmethod
    def get_array_name(var, access_data=True, prefix=None):
        '''
        Return a globally unique name for `var`.

        See CUDAStandaloneDevice.get_array_name for parameters.

        Here, `prefix` defaults to `'_ptr'` when `access_data=True`.

        `prefix='_ptr'` is used since the CUDACodeGenerator generates the
        `scalar_code` and `vector_code` snippets.
        '''
        # We have to do the import here to avoid circular import dependencies.
        from brian2.devices.device import get_device
        device = get_device()
        if access_data:
            if prefix is None:
                prefix = '_ptr'
            return device.get_array_name(var, access_data=True, prefix=prefix)
        else:
            return device.get_array_name(var, access_data=False, prefix=prefix)
    def translate_expression(self, expr):
        expr = word_substitute(expr, self.func_name_replacements)
        return CPPNodeRenderer(auto_vectorise=self.auto_vectorise).render_expr(expr).strip()
    def translate_statement(self, statement):
        var, op, expr, comment = (statement.var, statement.op,
                                  statement.expr, statement.comment)
        # For C++ we replace complex expressions involving boolean variables with a
        # sequence of if/then expressions with simpler expressions. This is
        # provided by the optimise_statements function.
        if statement.used_boolean_variables is not None and len(statement.used_boolean_variables):
            used_boolvars = statement.used_boolean_variables
            bool_simp = statement.boolean_simplified_expressions
            if op == ':=':
                # we have to declare the variable outside the if/then statement
                # (which unfortunately means we can't make it const, but the
                # optimisation is worth it anyway)
                codelines = [self.c_data_type(statement.dtype) + ' ' + var + ';']
                op = '='
            else:
                codelines = []
            firstline = True
            # bool_assigns is a sequence of (var, value) pairs giving the
            # conditions under which the simplified expression simp_expr holds
            for bool_assigns, simp_expr in bool_simp.items():
                # generate a boolean expression like ``var1 && var2 && !var3``
                atomics = []
                for boolvar, boolval in bool_assigns:
                    if boolval:
                        atomics.append(boolvar)
                    else:
                        atomics.append('!'+boolvar)
                if firstline:
                    line = ''
                else:
                    line = 'else '
                # only need another if statement when we have more than one
                # boolean variable
                if firstline or len(used_boolvars) > 1:
                    line += 'if('+(' && '.join(atomics))+')'
                line += '\n    '
                line += var + ' ' + op + ' ' + self.translate_expression(simp_expr) + ';'
                codelines.append(line)
                firstline = False
            code = '\n'.join(codelines)
        else:
            if op == ':=':
                decl = self.c_data_type(statement.dtype) + ' '
                op = '='
                if statement.constant:
                    decl = 'const ' + decl
            else:
                decl = ''
            code = decl + var + ' ' + op + ' ' + self.translate_expression(expr) + ';'
        if len(comment):
            code += ' // ' + comment
        return code
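    # Illustrative sketch (hypothetical statement, not part of the module): an
    # abstract code statement with op ':=' such as
    #     v_new := v_rest + dt*(I - v)/tau
    # is rendered as a single C++ declaration, e.g.
    #     const double v_new = v_rest + dt*(I - v)/tau;
    # (const only when `statement.constant` holds), while statements involving
    # boolean variables become the if/else cascade built above.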
    def translate_to_read_arrays(self, read, write, indices):
        lines = []
        # index and read arrays (index arrays first)
        for varname in itertools.chain(sorted(indices), sorted(read)):
            index_var = self.variable_indices[varname]
            var = self.variables[varname]
            if varname not in write:
                line = 'const '
            else:
                line = ''
            line = line + self.c_data_type(var.dtype) + ' ' + varname + ' = '
            line = line + self.get_array_name(var) + '[' + index_var + '];'
            lines.append(line)
        return lines
    def translate_to_declarations(self, read, write, indices):
        lines = []
        # simply declare variables that will be written but not read
        for varname in sorted(write):
            if varname not in read and varname not in indices:
                var = self.variables[varname]
                line = self.c_data_type(var.dtype) + ' ' + varname + ';'
                lines.append(line)
        return lines
    def conditional_write(self, line, statement, conditional_write_vars):
        lines = []
        if statement.var in conditional_write_vars:
            subs = {}
            condvar = conditional_write_vars[statement.var]
            lines.append(f'if({condvar})')
            lines.append('    ' + line)
        else:
            lines.append(line)
        return lines
    def translate_to_statements(self, statements, conditional_write_vars):
        lines = []
        # the actual code
        for stmt in statements:
            line = self.translate_statement(stmt)
            lines.extend(self.conditional_write(line, stmt, conditional_write_vars))
        return lines
    def translate_to_write_arrays(self, write):
        lines = []
        # write arrays
        for varname in sorted(write):
            index_var = self.variable_indices[varname]
            var = self.variables[varname]
            line = self.get_array_name(var) + '[' + index_var + '] = ' + varname + ';'
            lines.append(line)
        return lines
    def translate_one_statement_sequence(self, statements, scalar=False):
        # This function is refactored into four functions which perform the four
        # necessary operations. It's done like this so that code deriving from
        # this class can overwrite specific parts.

        all_unique = not self.has_repeated_indices(statements)

        read, write, indices, conditional_write_vars = self.arrays_helper(statements)

        try:
            # try to use atomics
            if not self._use_atomics or scalar or all_unique:
                raise ParallelisationError()
            # more complex translations to deal with repeated indices, which could
            # lead to race conditions when applied in parallel
            lines = self.parallelise_code(statements)
            self.uses_atomics = True
        except ParallelisationError:
            # don't use atomics
            lines = []
            # index and read arrays (index arrays first)
            lines += self.translate_to_read_arrays(read, write, indices)
            # simply declare variables that will be written but not read
            lines += self.translate_to_declarations(read, write, indices)
            # the actual code
            lines += self.translate_to_statements(statements,
                                                  conditional_write_vars)
            # write arrays
            lines += self.translate_to_write_arrays(write)

        code = '\n'.join(lines)

        # Check if 64bit integer types occur in the same line as a default
        # function. We can't get the arguments of the function call directly with
        # regex due to possibly nested parentheses inside function parentheses.
        statement_lines = self.translate_to_statements(statements,
                                                       conditional_write_vars)
        convertion_pref = prefs.devices.cuda_standalone.default_functions_integral_convertion
        # only check if there was no warning yet or if the conversion preference
        # has changed
        if not self.warned_integral_convertion or self.previous_convertion_pref != convertion_pref:
            for line in statement_lines:
                brian_funcs = re.search('_brian_(' + '|'.join(functions_C99) + ')', line)
                if brian_funcs is not None:
                    for identifier in get_identifiers(line):
                        if convertion_pref == np.float64:
                            # 64bit integer to floating-point conversions are not
                            # type safe
                            int64_type = re.search(
                                r'\bu?int64_t\s*{}\b'.format(identifier), code)
                            if int64_type is not None:
                                logger.warn(
                                    "Detected code statement with default function and "
                                    "64bit integer type in the same line. Using 64bit "
                                    "integer types as default function arguments is not "
                                    "type safe due to conversion of integer to 64bit "
                                    "floating-point types in device code. (relevant "
                                    "functions: {})\nDetected code statement:\n\t{}\n"
                                    "Generated from abstract code statements:\n\t{}\n"
                                    "".format(', '.join(functions_C99), line, statements),
                                    once=True)
                                self.warned_integral_convertion = True
                                self.previous_convertion_pref = np.float64
                        else:  # convertion_pref == np.float32
                            # 32bit and 64bit integer to floating-point conversions
                            # are not type safe
                            int32_64_type = re.search(
                                r'\bu?int(32|64)_t\s*{}\b'.format(identifier), code)
                            if int32_64_type is not None:
                                logger.warn(
                                    "Detected code statement with default function and "
                                    "32bit or 64bit integer type in the same line and the "
                                    "preference for default_functions_integral_convertion "
                                    "is 'float32'. Using 32bit or 64bit integer types as "
                                    "default function arguments is not type safe due to "
                                    "conversion of integer to single-precision "
                                    "floating-point types in device code. (relevant "
                                    "functions: {})\nDetected code statement:\n\t{}\n"
                                    "Generated from abstract code statements:\n\t{}\n"
                                    "".format(', '.join(functions_C99), line, statements),
                                    once=True)
                                self.warned_integral_convertion = True
                                self.previous_convertion_pref = np.float32
        return stripped_deindented_lines(code)
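    # Illustrative sketch (hypothetical synaptic code, not part of the module):
    # for the abstract statement `v += w` with `v` indexed by a repeated
    # `_postsynaptic_idx`, the serial fallback above yields vector code roughly
    # like
    #     const int32_t _postsynaptic_idx = _ptr_array_synapses__synaptic_post[_idx];
    #     const double w = _ptr_array_synapses_w[_idx];
    #     double v = _ptr_array_neurongroup_v[_postsynaptic_idx];
    #     v += w;
    #     _ptr_array_neurongroup_v[_postsynaptic_idx] = v;
    # while the atomic path (parallelise_code) replaces the read-modify-write by
    # a single _brian_atomicAdd call, see below.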
    def atomics_parallelisation(self, statement, conditional_write_vars,
                                used_variables):
        if not self._use_atomics:
            raise ParallelisationError()

        # Avoids circular import
        from brian2.devices.device import device

        # See https://github.com/brian-team/brian2/pull/531 for explanation
        used = set(get_identifiers(statement.expr))
        used = used.intersection(k for k in self.variables.keys()
                                 if k in self.variable_indices
                                 and self.variable_indices[k] != '_idx')
        used_variables.update(used)
        if statement.var in used_variables:
            raise ParallelisationError()

        expr = self.translate_expression(statement.expr)

        if statement.op == ':=' or self.variable_indices[statement.var] == '_idx' or not statement.inplace:
            if statement.op == ':=':
                decl = self.c_data_type(statement.dtype) + ' '
                op = '='
                if statement.constant:
                    decl = 'const ' + decl
            else:
                decl = ''
                op = statement.op
            line = f'{decl}{statement.var} {op} {expr};'
            line = [line]
        elif statement.inplace:
            sign = ''
            if statement.op == '+=':
                atomic_op = '_brian_atomicAdd'
            elif statement.op == '-=':
                # CUDA has hardware implementations for float (and, for CC >= 6.0,
                # for double) only for atomicAdd, which is faster than our
                # software implementation
                atomic_op = '_brian_atomicAdd'
                sign = '-'
            elif statement.op == '*=':
                atomic_op = '_brian_atomicMul'
            elif statement.op == '/=':
                atomic_op = '_brian_atomicDiv'
            else:
                # TODO: what other inplace operations are possible? Can we
                # implement them with atomicCAS?
                logger.info("Atomic operation for operation {op} is not "
                            "implemented.".format(op=statement.op))
                raise ParallelisationError()

            line = '{atomic_op}(&{array_name}[{idx}], ({array_dtype}){sign}({expr}));'.format(
                atomic_op=atomic_op,
                array_name=self.get_array_name(self.variables[statement.var]),
                idx=self.variable_indices[statement.var],
                array_dtype=c_data_type(self.variables[statement.var].dtype),
                sign=sign, expr=expr)
            # this is now a list of 1 or 2 lines (potentially with if(...))
            line = self.conditional_write(line, statement,
                                          conditional_write_vars)
        else:
            raise ParallelisationError()

        if len(statement.comment):
            line[-1] += ' // ' + statement.comment

        return line
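    # Illustrative sketch (hypothetical synapse statement, not part of the
    # module): for the in-place statement `v_post -= w`, the branch above emits a
    # line like
    #     _brian_atomicAdd(&_ptr_array_neurongroup_v[_postsynaptic_idx], (double)-(w));
    # i.e. `-=` is expressed as an atomicAdd of the negated value, since only
    # atomicAdd has a hardware implementation (see comment above).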
    def parallelise_code(self, statements):
        try:
            used_variables = set()
            all_read, all_write, all_indices, _ = self.arrays_helper(statements)
            lines = []
            # we are collecting all reads which are not only loaded because of
            # in-place atomic operations, and all writes which are not written to
            # by atomics
            collected_reads = set()
            collected_writes = set()
            atomic_lines = []
            for stmt in statements:
                lines.append(f'// Abstract code: {stmt.var} {stmt.op} {stmt.expr}')
                # We treat every statement individually with its own read and
                # write code to be on the safe side
                read, write, indices, conditional_write_vars = self.arrays_helper([stmt])
                # No need to load a variable if it is only in read because of the
                # in-place operation, but still load the index var, we need it for
                # the atomics
                if (stmt.inplace
                        and self.variable_indices[stmt.var] != '_idx'
                        and stmt.var not in get_identifiers(stmt.expr)):
                    read = read - {stmt.var}
                collected_reads = collected_reads.union(read)
                atomic_lines.extend(self.atomics_parallelisation(stmt,
                                                                 conditional_write_vars,
                                                                 used_variables))
                # Do not write back such values, the atomic functions have
                # modified the underlying array already
                if stmt.inplace and self.variable_indices[stmt.var] != '_idx':
                    write = write - {stmt.var}
                collected_writes = collected_writes.union(write)
            # we translate to read arrays altogether, otherwise we end up with
            # multiple declarations for statement identifiers
            # pass all_write here, since this determines the `const` keyword
            # (otherwise the variables written to by atomics would be `const`)
            # and all_indices, since we need those for the atomic calls
            lines.extend(self.translate_to_read_arrays(collected_reads, all_write,
                                                       all_indices))
            # we need to declare variables which will be written to but not read
            # (e.g. in assignments like `v_post = 1`) and which are not used in
            # atomics (hence we use `collected_writes`)
            lines.extend(self.translate_to_declarations(all_read, collected_writes,
                                                        all_indices))
            # add the atomic operations and statements
            lines.extend(atomic_lines)
            # only write variables which are not written to by atomics
            lines.extend(self.translate_to_write_arrays(collected_writes))
        except ParallelisationError:
            # log info here, since this means we tried but failed to use atomics
            logger.info("Failed to parallelise code by using atomic operations. "
                        "Falling back to serialized effect application. This "
                        "might be slow. Switching to `cpp_standalone` might be "
                        "faster. Code object name is {} and first line in "
                        "abstract code is: {}".format(self.name, statements[0]),
                        once=True)
            raise

        return lines
    def denormals_to_zero_code(self):
        if self.flush_denormals:
            return '''
            #define CSR_FLUSH_TO_ZERO         (1 << 15)
            unsigned csr = __builtin_ia32_stmxcsr();
            csr |= CSR_FLUSH_TO_ZERO;
            __builtin_ia32_ldmxcsr(csr);
            '''
        else:
            return ''
    def _add_user_function(self, varname, variable, added):
        impl = variable.implementations[self.codeobj_class]
        if (impl.name, variable) in added:
            return  # nothing to do
        else:
            added.add((impl.name, variable))
        support_code = []
        hash_defines = []
        pointers = []
        kernel_lines = []
        user_functions = [(varname, variable)]
        funccode = impl.get_code(self.owner)
        if isinstance(funccode, str):
            # Rename references to any dependencies if necessary
            for dep_name, dep in impl.dependencies.items():
                dep_impl = dep.implementations[self.codeobj_class]
                dep_impl_name = dep_impl.name
                if dep_impl_name is None:
                    dep_impl_name = dep.pyfunc.__name__
                if dep_name != dep_impl_name:
                    funccode = word_substitute(funccode,
                                               {dep_name: dep_impl_name})

            ### Different from CPPCodeGenerator: We format the funccode dtypes here
            from brian2.devices.device import get_device
            device = get_device()
            if varname in functions_C99:
                funccode = funccode.format(default_type=self.default_func_type,
                                           other_type=self.other_func_type)
            elif varname in ['clip', 'exprel']:
                funccode = funccode.format(float_dtype=self.float_dtype)
            ###

        if isinstance(funccode, str):
            funccode = {'support_code': funccode}
        if funccode is not None:
            # To make namespace variables available to functions, we create global
            # variables and assign to them in the main code
            func_namespace = impl.get_namespace(self.owner) or {}
            for ns_key, ns_value in func_namespace.items():
                # This section is adapted from CPPCodeGenerator such that file
                # global namespace pointers can be used in both host and device
                # code.
                assert hasattr(ns_value, 'dtype'), \
                    'This should not have happened. Please report at ' \
                    'https://github.com/brian-team/brian2cuda/issues/new'
                if ns_value.shape == ():
                    raise NotImplementedError((
                        'Directly replace scalar values in the function '
                        'instead of providing them via the namespace'))
                type_str = self.c_data_type(ns_value.dtype) + '*'
                namespace_ptr = '''
                    #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0))
                    __device__ {dtype} _namespace{name};
                    #else
                    {dtype} _namespace{name};
                    #endif
                    '''.format(dtype=type_str, name=ns_key)
                support_code.append(namespace_ptr)
                # pointer lines will be used in codeobjects running on the host
                pointers.append('_namespace{name} = {name};'.format(name=ns_key))
                # kernel lines will be used in codeobjects running on the device
                kernel_lines.append('''
                    #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0))
                    _namespace{name} = d{name};
                    #else
                    _namespace{name} = {name};
                    #endif
                    '''.format(name=ns_key))
            support_code.append(deindent(funccode.get('support_code', '')))
            hash_defines.append(deindent(funccode.get('hashdefine_code', '')))

        dep_hash_defines = []
        dep_pointers = []
        dep_support_code = []
        dep_kernel_lines = []
        if impl.dependencies is not None:
            for dep_name, dep in impl.dependencies.items():
                if dep_name not in self.variables:
                    self.variables[dep_name] = dep
                    dep_impl = dep.implementations[self.codeobj_class]
                    if dep_name != dep_impl.name:
                        self.func_name_replacements[dep_name] = dep_impl.name
                    user_function = self._add_user_function(dep_name, dep, added)
                    if user_function is not None:
                        hd, ps, sc, uf, kl = user_function
                        dep_hash_defines.extend(hd)
                        dep_pointers.extend(ps)
                        dep_support_code.extend(sc)
                        user_functions.extend(uf)
                        dep_kernel_lines.extend(kl)

        return (dep_hash_defines + hash_defines,
                dep_pointers + pointers,
                dep_support_code + support_code,
                user_functions,
                dep_kernel_lines + kernel_lines)
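    # Illustrative sketch (hypothetical namespace key `my_values`, not part of the
    # module): for a function implementation whose namespace provides an array
    # under `my_values`, the loop above adds support code like
    #     #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0))
    #     __device__ double* _namespacemy_values;
    #     #else
    #     double* _namespacemy_values;
    #     #endif
    # plus a host pointer line `_namespacemy_values = my_values;` and a kernel
    # line that assigns the device copy, `_namespacemy_values = dmy_values;`, when
    # compiled for device code.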
    def determine_keywords(self):
        # set up the restricted pointers, these are used so that the compiler
        # knows there is no aliasing in the pointers, for optimisation
        pointers = []
        # Add additional lines inside the kernel functions
        kernel_lines = []
        # It is possible that several different variable names refer to the same
        # array. E.g. in gapjunction code, v_pre and v_post refer to the same
        # array if a group is connected to itself
        handled_pointers = set()
        template_kwds = {}
        # Again, do the import here to avoid a circular dependency.
        from brian2.devices.device import get_device
        device = get_device()
        for varname, var in self.variables.items():
            if isinstance(var, ArrayVariable):
                # This is the "true" array name, not the restricted pointer.
                array_name = device.get_array_name(var)
                pointer_name = self.get_array_name(var)
                if pointer_name in handled_pointers:
                    continue
                if getattr(var, 'ndim', 1) > 1:
                    # multidimensional (dynamic) arrays have to be treated
                    # differently
                    continue
                restrict = self.restrict
                # turn off restricted pointers for scalars for safety
                if var.scalar:
                    restrict = ' '
                # Need to use correct dt type in pointers_lines for single
                # precision, see #148
                if varname == "dt" and prefs.core.default_float_dtype == np.float32:
                    # c_data_type(variable.dtype) is float, but we need double
                    dtype = "double"
                else:
                    dtype = self.c_data_type(var.dtype)
                line = '{0}* {1} {2} = {3};'.format(dtype, restrict, pointer_name,
                                                    array_name)
                pointers.append(line)
                handled_pointers.add(pointer_name)

        # set up the functions
        user_functions = []
        support_code = []
        hash_defines = []
        added = set()  # keep track of functions that were added
        for varname, variable in list(self.variables.items()):
            if isinstance(variable, Function):
                user_func = self._add_user_function(varname, variable, added)
                if user_func is not None:
                    hd, ps, sc, uf, kl = user_func
                    user_functions.extend(uf)
                    support_code.extend(sc)
                    pointers.extend(ps)
                    hash_defines.extend(hd)
                    kernel_lines.extend(kl)

        # Generate universal_support_code once when the first codeobject is
        # created. Can't do it at import time since we need access to user
        # preferences. This is a class attribute (not an instance attribute).
        if CUDACodeGenerator.universal_support_code is None:
            _atomic_support_code = _generate_atomic_support_code()
            CUDACodeGenerator.universal_support_code = (
                _hightype_support_code
                + _mod_support_code
                + _floordiv_support_code
                + _pow_support_code
                + _atomic_support_code
            )
        support_code.append(CUDACodeGenerator.universal_support_code)

        # Clock variables (t, dt, timestep) are passed by value to kernels and
        # need to be translated back into pointers for scalar/vector code.
        for varname, variable in self.variables.items():
            if hasattr(variable, 'owner') and isinstance(variable.owner, Clock):
                # get arrayname without _ptr suffix (e.g. _array_defaultclock_dt)
                arrayname = self.get_array_name(variable, prefix='')
                # kernel_lines appear before dt is cast to float (in scalar_code),
                # hence we need to still use double (used in kernel parameters),
                # see #148
                if varname == "dt" and prefs.core.default_float_dtype == np.float32:
                    # c_data_type(variable.dtype) is float, but we need double
                    dtype = "double"
                else:
                    dtype = c_data_type(variable.dtype)
                line = f"const {dtype}* _ptr{arrayname} = &_value{arrayname};"
                if line not in kernel_lines:
                    kernel_lines.append(line)

        keywords = {'pointers_lines': stripped_deindented_lines('\n'.join(pointers)),
                    'support_code_lines': stripped_deindented_lines('\n'.join(support_code)),
                    'hashdefine_lines': stripped_deindented_lines('\n'.join(hash_defines)),
                    'denormals_code_lines': stripped_deindented_lines('\n'.join(self.denormals_to_zero_code())),
                    'kernel_lines': stripped_deindented_lines('\n'.join(kernel_lines)),
                    'uses_atomics': self.uses_atomics
                    }
        keywords.update(template_kwds)
        return keywords
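    # Illustrative sketch (hypothetical network, not part of the module): with
    # double precision, the clock-variable loop above adds a kernel line like
    #     const double* _ptr_array_defaultclock_dt = &_value_array_defaultclock_dt;
    # so that scalar/vector code can keep addressing clock variables through the
    # usual `_ptr...` pointer names even though kernels receive them by value.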
class CUDAAtomicsCodeGenerator(CUDACodeGenerator):

    _use_atomics = True
################################################################################
# Implement functions
################################################################################

# The CUDA Math library supports all C99 standard float and double math
# functions. To have support for integral types in device code, we need to wrap
# these functions and convert integral types to floating-point types. In host
# code, we can just use the standard math functions directly.

### Functions available in the C99 standard
functions_C99 = []
func_translations = []
# Functions that exist under the same name in C++
for func in ['sin', 'cos', 'tan', 'sinh', 'cosh', 'tanh', 'exp', 'log', 'log10',
             'expm1', 'log1p', 'sqrt', 'ceil', 'floor']:
    func_translations.append((func, func))

# Functions that exist under a different name in C++
for func, func_cuda in [('arcsin', 'asin'), ('arccos', 'acos'),
                        ('arctan', 'atan')]:
    func_translations.append((func, func_cuda))

for func, func_cuda in func_translations:
    functions_C99.append(func)
    cuda_code = '''
        template <typename T>
        __host__ __device__
        {{default_type}} _brian_{func}(T value)
        {{{{
        #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0))
            return {func}(({{default_type}})value);
        #else
            return {func}(value);
        #endif
        }}}}

        inline __host__ __device__
        {{other_type}} _brian_{func}({{other_type}} value)
        {{{{
            return {func}(value);
        }}}}
        '''.format(func=func_cuda)
    # {default_type} and {other_type} will be formatted in
    # CUDACodeGenerator.determine_keywords() depending on user prefs (which are
    # not yet set when this code snippet is created)
    DEFAULT_FUNCTIONS[func].implementations.add_implementation(
        CUDACodeGenerator,
        code=cuda_code,
        name=f'_brian_{func_cuda}'
    )

# TODO: make float version type safe or print warning (see #233)
exprel_code = '''
__host__ __device__
static inline {float_dtype} _brian_exprel({float_dtype} x)
{{
    if (fabs(x) < 1e-16)
        return 1.0;
    if (x > 717)
        return INFINITY;
    return expm1(x)/x;
}}
'''
DEFAULT_FUNCTIONS['exprel'].implementations.add_implementation(CUDACodeGenerator,
                                                               code=exprel_code,
                                                               name='_brian_exprel')

# std::abs is available and already overloaded for integral types in device code
abs_code = '''
#define _brian_abs std::abs
'''
DEFAULT_FUNCTIONS['abs'].implementations.add_implementation(CUDACodeGenerator,
                                                            code=abs_code,
                                                            name='_brian_abs')

### Functions that need to be implemented specifically
int_code = '''
template <typename T>
__host__ __device__
int _brian_int(T value)
{
    return (int)value;
}

template <>
inline __host__ __device__
int _brian_int(bool value)
{
    return value ? 1 : 0;
}
'''
DEFAULT_FUNCTIONS['int'].implementations.add_implementation(CUDACodeGenerator,
                                                            code=int_code,
                                                            name='_brian_int')

# TODO: make float version type safe or print warning (see #51)
clip_code = '''
inline __host__ __device__
{float_dtype} _brian_clip(const {float_dtype} value,
                          const {float_dtype} a_min,
                          const {float_dtype} a_max)
{{
    if (value < a_min)
        return a_min;
    if (value > a_max)
        return a_max;
    return value;
}}
'''
DEFAULT_FUNCTIONS['clip'].implementations.add_implementation(CUDACodeGenerator,
                                                             code=clip_code,
                                                             name='_brian_clip')

sign_code = '''
template <typename T>
__host__ __device__
int _brian_sign(T val)
{
    return (T(0) < val) - (val < T(0));
}
'''
DEFAULT_FUNCTIONS['sign'].implementations.add_implementation(CUDACodeGenerator,
                                                             code=sign_code,
                                                             name='_brian_sign')

randn_code = '''
#define _randn(vectorisation_idx) (_ptr_array_%CODEOBJ_NAME%_randn[vectorisation_idx])
'''
DEFAULT_FUNCTIONS['randn'].implementations.add_implementation(CUDACodeGenerator,
                                                              code=randn_code,
                                                              name='_randn')

rand_code = '''
#define _rand(vectorisation_idx) (_ptr_array_%CODEOBJ_NAME%_rand[vectorisation_idx])
'''
DEFAULT_FUNCTIONS['rand'].implementations.add_implementation(CUDACodeGenerator,
                                                             code=rand_code,
                                                             name='_rand')

poisson_code = '''
// Notes on the poisson function implementation:
// - Curand generates unsigned int, brian uses int32_t, casting is happening here
// - The only codeobject that uses host side random numbers is
//   synapses_create_generator.cu. There, a C++ poisson function (same as used in
//   C++ Standalone) is implemented and used via _host_poisson(...)
// - For synapses_create_generator.cu templates, the _poisson calls are unchanged
//   (no regex substitution) and hence just use the C++ implementation
// - For device side poisson, we have two cases:
//   - If the lambda is constant across units of the codeobject (scalar lambda),
//     we use host side RNG in rand.cu and pass the data pointer to _poisson
//     (first overloaded definition).
//   - If the lambda is a variable itself that might be different for each unit
//     in the codeobject (vectorized lambda), we instead use device side RNG
//     (device code path in the second overloaded _poisson definition).
__device__
int32_t _poisson(unsigned int* _poisson_buffer, int32_t _idx)
{
    // poisson with constant lambda are generated with curand host API in rand.cu
    return (int32_t) _poisson_buffer[_idx];
}

__host__ __device__
int32_t _poisson(double _lambda, int32_t _idx)
{
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0))
    if (_lambda <= 0)
    {
        return 0;
    }
    else
    {
        // poisson with variable lambda are generated with curand device API
        // on-the-fly
        curandState localState = brian::d_curand_states[_idx];
        unsigned int poisson_number = curand_poisson(&localState, _lambda);
        brian::d_curand_states[_idx] = localState;
        return (int32_t) poisson_number;
    }
#else
    // use C++ implementation defined in synapses_create_generator.cu
    return _host_poisson(_lambda, _idx);
#endif
}
'''
DEFAULT_FUNCTIONS['poisson'].implementations.add_implementation(
    CUDACodeGenerator,
    code=poisson_code,
    name='_poisson',
    compiler_kwds={"headers": ["<curand.h>"]}
)

# Add support for the `timestep` function added in Brian 2.3.1
timestep_code = '''
__host__ __device__
static inline int64_t _timestep(double t, double dt)
{
    return (int64_t)((t + 1e-3*dt)/dt);
}
'''
DEFAULT_FUNCTIONS['timestep'].implementations.add_implementation(CUDACodeGenerator,
                                                                 code=timestep_code,
                                                                 name='_timestep')
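# Illustrative usage sketch (hypothetical user function, not part of the module):
# user-defined functions can target this generator via its class_name 'cuda',
# providing the 'support_code' (and optionally 'hashdefine_code') keys described
# in the CUDACodeGenerator docstring. A minimal example, assuming brian2cuda is
# set as the active device:
#
#   from brian2 import Function, implementation, check_units
#
#   @implementation('cuda', '''
#   __host__ __device__ double square(double x)
#   {
#       return x * x;
#   }
#   ''')
#   @check_units(x=1, result=1)
#   def square(x):
#       return x ** 2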