Source code for brian2cuda.cuda_generator

import itertools

import numpy as np
import re

from brian2.utils.stringtools import (deindent, stripped_deindented_lines,
                                      word_substitute)
from brian2.utils.logger import get_logger
from brian2.utils.stringtools import get_identifiers
from brian2.parsing.rendering import CPPNodeRenderer
from brian2.core.functions import Function, DEFAULT_FUNCTIONS
from brian2.core.clocks import Clock
from brian2.core.preferences import prefs
from brian2.core.variables import ArrayVariable
from brian2.codegen.generators.cpp_generator import c_data_type
from brian2.codegen.generators.base import CodeGenerator
from brian2.devices import get_device
from brian2cuda.utils.gputools import get_cuda_runtime_version


__all__ = ['CUDACodeGenerator', 'CUDAAtomicsCodeGenerator', 'c_data_type']


logger = get_logger('brian2.codegen.generators.cuda_generator')


class ParallelisationError(Exception):
    pass
# This is a function since this code needs to be generated after user preferences
# are set (for GPU selection)
def _generate_atomic_support_code():
    # (arg_dtype, int_dtype, name in type cast function)
    overloads = [('int', 'int', 'int'),
                 ('float', 'int', 'int'),
                 ('double', 'unsigned long long int', 'longlong')]

    cuda_runtime_version = get_cuda_runtime_version()

    # Note: There are atomic functions that are supported only for compute
    # capability >= 3.5. We don't check for those as we require at least 3.5. If we
    # ever support smaller CC, we need to adapt the code here (see Atomic Functions
    # in CUDA Programming Guide)
    device = get_device()
    assert device.minimal_compute_capability >= 3.5, "Need to adapt atomic support code"

    software_implementation_float_dtype = '''
        // software implementation
        {int_dtype}* address_as_int = ({int_dtype}*)address;
        {int_dtype} old = *address_as_int, assumed;

        do {{
            assumed = old;
            old = atomicCAS(address_as_int, assumed,
                            __{arg_dtype}_as_{val_type_cast}(val {op} __{val_type_cast}_as_{arg_dtype}(assumed)));
        // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
        }} while (assumed != old);

        return __{val_type_cast}_as_{arg_dtype}(old);
        '''

    hardware_implementation = '''
        // hardware implementation
        return atomic{op_name}(address, val);
        '''

    atomic_support_code = ''
    for op_name, op in [('Add', '+'), ('Mul', '*'), ('Div', '/')]:
        for arg_dtype, int_dtype, val_type_cast in overloads:
            # atomicAdd has hardware implementations, depending on dtype, compute
            # capability and CUDA runtime version. This section uses them when
            # possible.
            if op_name == 'Add':
                format_sw = True
                code = '''
                inline __device__ {arg_dtype} _brian_atomic{op_name}({arg_dtype}* address, {arg_dtype} val)
                {{
                '''
                if arg_dtype in ['int', 'float']:
                    code += f'''
                    {hardware_implementation}
                    '''
                elif arg_dtype == 'double':
                    if cuda_runtime_version >= 8.0:
                        # Check for CC at runtime to use software or hardware
                        # implementation.
                        # Don't need to check for defined __CUDA_ARCH__, it is
                        # always defined in __device__ and __global__ functions
                        # (no host code path).
                        code += '''
                        #if (__CUDA_ARCH__ >= 600)
                        {hardware_implementation}
                        #else
                        {software_implementation}
                        #endif
                        '''.format(
                            hardware_implementation=hardware_implementation,
                            software_implementation=software_implementation_float_dtype
                        )
                    else:
                        # For runtime < 8.0, there is no atomicAdd hardware
                        # implementation support independent of CC.
                        code += '''
                        {software_implementation}
                        '''.format(
                            software_implementation=software_implementation_float_dtype
                        )
                        format_sw = True
                code += '''
                }}
                '''
                if format_sw:
                    code = code.format(
                        arg_dtype=arg_dtype, int_dtype=int_dtype,
                        val_type_cast=val_type_cast, op_name=op_name, op=op
                    )
                else:
                    code = code.format(arg_dtype=arg_dtype, op_name=op_name)
                atomic_support_code += code
            # For other atomic operations on int types, we can use the same
            # template.
            elif arg_dtype == 'int':
                atomic_support_code += '''
                inline __device__ int _brian_atomic{op_name}(int* address, int val)
                {{
                    // software implementation
                    int old = *address, assumed;

                    do {{
                        assumed = old;
                        old = atomicCAS(address, assumed, val {op} assumed);
                    }} while (assumed != old);

                    return old;
                }}
                '''.format(op_name=op_name, op=op)
            # Else (not atomicAdd and not int types) use this template.
            else:
                # In the software implementation, we treat all data as integer
                # types (since atomicCAS is only defined for integer types) and
                # use atomicCAS to swap the memory with our desired value
                code = '''
                inline __device__ {{arg_dtype}} _brian_atomic{{op_name}}({{arg_dtype}}* address, {{arg_dtype}} val)
                {{{{
                    {software_implementation}
                }}}}
                '''.format(software_implementation=software_implementation_float_dtype)
                # above, we just added the software_implementation code; below we
                # add the other format variables (also present in
                # software_implementation)
                code = code.format(
                    arg_dtype=arg_dtype, int_dtype=int_dtype,
                    val_type_cast=val_type_cast, op_name=op_name, op=op,
                )
                atomic_support_code += code

    return atomic_support_code


# CUDA does not support modulo arithmetic for long double. Since we can't give a
# warning, we let the compilation fail, which gives an error message of type
#   error: more than one instance of overloaded function "_brian_mod" matches the argument list: ...
# TODO: can we produce a more informative error message?
_typestrs = ['int', 'long', 'long long', 'float', 'double']  # , 'long double']

_hightype_support_code = 'template < typename T1, typename T2 > struct _higher_type;\n'
for ix, xtype in enumerate(_typestrs):
    for iy, ytype in enumerate(_typestrs):
        hightype = _typestrs[max(ix, iy)]
        _hightype_support_code += '''
        template < > struct _higher_type<{xtype},{ytype}> {{ typedef {hightype} type; }};
        '''.format(hightype=hightype, xtype=xtype, ytype=ytype)

_mod_support_code = '''
template < typename T1, typename T2 >
__host__ __device__ static inline typename _higher_type<T1,T2>::type
_brian_mod(T1 x, T2 y)
{{
    return x-y*floor(1.0*x/y);
}}
'''

_floordiv_support_code = '''
template < typename T1, typename T2 >
__host__ __device__ static inline typename _higher_type<T1,T2>::type
_brian_floordiv(T1 x, T2 y)
{{
    return floor(1.0*x/y);
}}
'''

_pow_support_code = '''
#ifdef _MSC_VER
#define _brian_pow(x, y) (pow((double)(x), (y)))
#else
#define _brian_pow(x, y) (pow((x), (y)))
#endif
'''
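# Illustrative sketch (not part of the module): after both format passes, the
# ('double', 'unsigned long long int', 'longlong') overload for op 'Mul' above
# expands to roughly the following CUDA function (whitespace simplified):
#
#   inline __device__ double _brian_atomicMul(double* address, double val)
#   {
#       // software implementation
#       unsigned long long int* address_as_int = (unsigned long long int*)address;
#       unsigned long long int old = *address_as_int, assumed;
#       do {
#           assumed = old;
#           old = atomicCAS(address_as_int, assumed,
#                           __double_as_longlong(val * __longlong_as_double(assumed)));
#       // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
#       } while (assumed != old);
#       return __longlong_as_double(old);
#   }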
class CUDACodeGenerator(CodeGenerator):
    '''
    C++ language with CUDA library

    CUDA code templates should provide Jinja2 macros with the following names:

    ``main``
        The main loop.
    ``support_code``
        The support code (function definitions, etc.), compiled in a separate file.

    For user-defined functions, there are two keys to provide:

    ``support_code``
        The function definition which will be added to the support code.
    ``hashdefine_code``
        The ``#define`` code added to the main loop.

    See `TimedArray` for an example of these keys.
    '''
    class_name = 'cuda'

    _use_atomics = False

    # This will be generated when the first class instance calls
    # `determine_keywords()`
    universal_support_code = None

    def __init__(self, *args, **kwds):
        super(CUDACodeGenerator, self).__init__(*args, **kwds)
        self.c_data_type = c_data_type
        self.warned_integral_convertion = False
        self.previous_convertion_pref = None
        self.uses_atomics = False

        # Set conversion types for standard C99 functions in device code.
        # These are used in _add_user_function to format the function code.
        if prefs.devices.cuda_standalone.default_functions_integral_convertion == np.float64:
            self.default_func_type = 'double'
            self.other_func_type = 'float'
        else:  # np.float32
            self.default_func_type = 'float'
            self.other_func_type = 'double'
        # Set the clip function to use either all float or all double arguments,
        # see #51 for details.
        if prefs['core.default_float_dtype'] == np.float64:
            self.float_dtype = 'double'
        else:  # np.float32
            self.float_dtype = 'float'

    @property
    def restrict(self):
        return prefs['codegen.generators.cpp.restrict_keyword'] + ' '

    @property
    def flush_denormals(self):
        return prefs['codegen.generators.cpp.flush_denormals']
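    # Illustrative sketch (hypothetical session, not part of the module): the
    # conversion preference above determines which floating-point type integral
    # arguments of the wrapped C99 math functions are cast to in device code, e.g.
    #
    #   from brian2 import prefs
    #   import numpy as np
    #   prefs.devices.cuda_standalone.default_functions_integral_convertion = np.float32
    #   # -> self.default_func_type == 'float', so the generated _brian_sin
    #   #    wrapper casts integral arguments via `sin((float)value)` on device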
    @staticmethod
    def get_array_name(var, access_data=True, prefix=None):
        '''
        Return a globally unique name for `var`.

        See CUDAStandaloneDevice.get_array_name for parameters.

        Here, `prefix` defaults to `'_ptr'` when `access_data=True`.

        `prefix='_ptr'` is used since the CUDACodeGenerator generates the
        `scalar_code` and `vector_code` snippets.
        '''
        # We have to do the import here to avoid circular import dependencies.
        from brian2.devices.device import get_device
        device = get_device()
        if access_data:
            if prefix is None:
                prefix = '_ptr'
            return device.get_array_name(var, access_data=True, prefix=prefix)
        else:
            return device.get_array_name(var, access_data=False, prefix=prefix)
    def translate_expression(self, expr):
        expr = word_substitute(expr, self.func_name_replacements)
        return CPPNodeRenderer(auto_vectorise=self.auto_vectorise).render_expr(expr).strip()
    def translate_statement(self, statement):
        var, op, expr, comment = (statement.var, statement.op,
                                  statement.expr, statement.comment)
        # For C++ we replace complex expressions involving boolean variables with a
        # sequence of if/then expressions with simpler expressions. This is
        # provided by the optimise_statements function.
        if statement.used_boolean_variables is not None and len(statement.used_boolean_variables):
            used_boolvars = statement.used_boolean_variables
            bool_simp = statement.boolean_simplified_expressions
            if op == ':=':
                # we have to declare the variable outside the if/then statement
                # (which unfortunately means we can't make it const, but the
                # optimisation is worth it anyway)
                codelines = [self.c_data_type(statement.dtype) + ' ' + var + ';']
                op = '='
            else:
                codelines = []
            firstline = True
            # bool_assigns is a sequence of (var, value) pairs giving the
            # conditions under which the simplified expression simp_expr holds
            for bool_assigns, simp_expr in bool_simp.items():
                # generate a boolean expression like ``var1 && var2 && !var3``
                atomics = []
                for boolvar, boolval in bool_assigns:
                    if boolval:
                        atomics.append(boolvar)
                    else:
                        atomics.append('!'+boolvar)
                if firstline:
                    line = ''
                else:
                    line = 'else '
                # only need another if statement when we have more than one
                # boolean variable
                if firstline or len(used_boolvars) > 1:
                    line += 'if('+(' && '.join(atomics))+')'
                line += '\n    '
                line += var + ' ' + op + ' ' + self.translate_expression(simp_expr) + ';'
                codelines.append(line)
                firstline = False
            code = '\n'.join(codelines)
        else:
            if op == ':=':
                decl = self.c_data_type(statement.dtype) + ' '
                op = '='
                if statement.constant:
                    decl = 'const ' + decl
            else:
                decl = ''
            code = decl + var + ' ' + op + ' ' + self.translate_expression(expr) + ';'
        if len(comment):
            code += ' // ' + comment
        return code
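    # Illustrative sketch (hypothetical statement, not part of the module): an
    # abstract code statement with op ':=' such as
    #     v_new := v_rest + dt*(I - v)/tau
    # is rendered as a single C++ declaration, e.g.
    #     const double v_new = v_rest + dt*(I - v)/tau;
    # (const only when `statement.constant` holds), while statements involving
    # boolean variables become the if/else cascade built above.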
    def translate_to_read_arrays(self, read, write, indices):
        lines = []
        # index and read arrays (index arrays first)
        for varname in itertools.chain(sorted(indices), sorted(read)):
            index_var = self.variable_indices[varname]
            var = self.variables[varname]
            if varname not in write:
                line = 'const '
            else:
                line = ''
            line = line + self.c_data_type(var.dtype) + ' ' + varname + ' = '
            line = line + self.get_array_name(var) + '[' + index_var + '];'
            lines.append(line)
        return lines
    def translate_to_declarations(self, read, write, indices):
        lines = []
        # simply declare variables that will be written but not read
        for varname in sorted(write):
            if varname not in read and varname not in indices:
                var = self.variables[varname]
                line = self.c_data_type(var.dtype) + ' ' + varname + ';'
                lines.append(line)
        return lines
    def conditional_write(self, line, statement, conditional_write_vars):
        lines = []
        if statement.var in conditional_write_vars:
            subs = {}
            condvar = conditional_write_vars[statement.var]
            lines.append(f'if({condvar})')
            lines.append('    ' + line)
        else:
            lines.append(line)
        return lines
    def translate_to_statements(self, statements, conditional_write_vars):
        lines = []
        # the actual code
        for stmt in statements:
            line = self.translate_statement(stmt)
            lines.extend(self.conditional_write(line, stmt, conditional_write_vars))
        return lines
    def translate_to_write_arrays(self, write):
        lines = []
        # write arrays
        for varname in sorted(write):
            index_var = self.variable_indices[varname]
            var = self.variables[varname]
            line = self.get_array_name(var) + '[' + index_var + '] = ' + varname + ';'
            lines.append(line)
        return lines
    def translate_one_statement_sequence(self, statements, scalar=False):
        # This function is refactored into four functions which perform the four
        # necessary operations. It's done like this so that code deriving from
        # this class can overwrite specific parts.

        all_unique = not self.has_repeated_indices(statements)

        read, write, indices, conditional_write_vars = self.arrays_helper(statements)

        try:
            # try to use atomics
            if not self._use_atomics or scalar or all_unique:
                raise ParallelisationError()
            # more complex translations to deal with repeated indices, which could
            # lead to race conditions when applied in parallel
            lines = self.parallelise_code(statements)
            self.uses_atomics = True
        except ParallelisationError:
            # don't use atomics
            lines = []
            # index and read arrays (index arrays first)
            lines += self.translate_to_read_arrays(read, write, indices)
            # simply declare variables that will be written but not read
            lines += self.translate_to_declarations(read, write, indices)
            # the actual code
            lines += self.translate_to_statements(statements,
                                                  conditional_write_vars)
            # write arrays
            lines += self.translate_to_write_arrays(write)

        code = '\n'.join(lines)

        # Check if 64bit integer types occur in the same line as a default
        # function. We can't get the arguments of the function call directly with
        # regex due to possibly nested parentheses inside function parentheses.
        statement_lines = self.translate_to_statements(statements,
                                                       conditional_write_vars)
        convertion_pref = prefs.devices.cuda_standalone.default_functions_integral_convertion
        # only check if there was no warning yet or if the conversion preference
        # has changed
        if not self.warned_integral_convertion or self.previous_convertion_pref != convertion_pref:
            for line in statement_lines:
                brian_funcs = re.search('_brian_(' + '|'.join(functions_C99) + ')', line)
                if brian_funcs is not None:
                    for identifier in get_identifiers(line):
                        if convertion_pref == np.float64:
                            # 64bit integer to floating-point conversions are not
                            # type safe
                            int64_type = re.search(
                                r'\bu?int64_t\s*{}\b'.format(identifier), code)
                            if int64_type is not None:
                                logger.warn(
                                    "Detected code statement with default function and "
                                    "64bit integer type in the same line. Using 64bit "
                                    "integer types as default function arguments is not "
                                    "type safe due to conversion of integer to 64bit "
                                    "floating-point types in device code. (relevant "
                                    "functions: {})\nDetected code statement:\n\t{}\n"
                                    "Generated from abstract code statements:\n\t{}\n"
                                    "".format(', '.join(functions_C99), line, statements),
                                    once=True)
                                self.warned_integral_convertion = True
                                self.previous_convertion_pref = np.float64
                        else:  # convertion_pref == np.float32
                            # 32bit and 64bit integer to floating-point conversions
                            # are not type safe
                            int32_64_type = re.search(
                                r'\bu?int(32|64)_t\s*{}\b'.format(identifier), code)
                            if int32_64_type is not None:
                                logger.warn(
                                    "Detected code statement with default function and "
                                    "32bit or 64bit integer type in the same line and the "
                                    "preference for default_functions_integral_convertion "
                                    "is 'float32'. Using 32bit or 64bit integer types as "
                                    "default function arguments is not type safe due to "
                                    "conversion of integer to single-precision "
                                    "floating-point types in device code. (relevant "
                                    "functions: {})\nDetected code statement:\n\t{}\n"
                                    "Generated from abstract code statements:\n\t{}\n"
                                    "".format(', '.join(functions_C99), line, statements),
                                    once=True)
                                self.warned_integral_convertion = True
                                self.previous_convertion_pref = np.float32
        return stripped_deindented_lines(code)
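    # Illustrative sketch (hypothetical synaptic code, not part of the module):
    # for the abstract statement `v += w` with `v` indexed by a repeated
    # `_postsynaptic_idx`, the serial fallback above yields vector code roughly
    # like
    #     const int32_t _postsynaptic_idx = _ptr_array_synapses__synaptic_post[_idx];
    #     const double w = _ptr_array_synapses_w[_idx];
    #     double v = _ptr_array_neurongroup_v[_postsynaptic_idx];
    #     v += w;
    #     _ptr_array_neurongroup_v[_postsynaptic_idx] = v;
    # while the atomic path (parallelise_code) replaces the read-modify-write by
    # a single _brian_atomicAdd call, see below.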
    def atomics_parallelisation(self, statement, conditional_write_vars,
                                used_variables):
        if not self._use_atomics:
            raise ParallelisationError()

        # Avoids circular import
        from brian2.devices.device import device

        # See https://github.com/brian-team/brian2/pull/531 for explanation
        used = set(get_identifiers(statement.expr))
        used = used.intersection(k for k in self.variables.keys()
                                 if k in self.variable_indices
                                 and self.variable_indices[k] != '_idx')
        used_variables.update(used)
        if statement.var in used_variables:
            raise ParallelisationError()

        expr = self.translate_expression(statement.expr)

        if statement.op == ':=' or self.variable_indices[statement.var] == '_idx' or not statement.inplace:
            if statement.op == ':=':
                decl = self.c_data_type(statement.dtype) + ' '
                op = '='
                if statement.constant:
                    decl = 'const ' + decl
            else:
                decl = ''
                op = statement.op
            line = f'{decl}{statement.var} {op} {expr};'
            line = [line]
        elif statement.inplace:
            sign = ''
            if statement.op == '+=':
                atomic_op = '_brian_atomicAdd'
            elif statement.op == '-=':
                # CUDA has hardware implementations for float (and, for CC >= 6.0,
                # for double) only for atomicAdd, which is faster than our
                # software implementation
                atomic_op = '_brian_atomicAdd'
                sign = '-'
            elif statement.op == '*=':
                atomic_op = '_brian_atomicMul'
            elif statement.op == '/=':
                atomic_op = '_brian_atomicDiv'
            else:
                # TODO: what other inplace operations are possible? Can we
                # implement them with atomicCAS?
                logger.info("Atomic operation for operation {op} is not "
                            "implemented.".format(op=statement.op))
                raise ParallelisationError()

            line = '{atomic_op}(&{array_name}[{idx}], ({array_dtype}){sign}({expr}));'.format(
                atomic_op=atomic_op,
                array_name=self.get_array_name(self.variables[statement.var]),
                idx=self.variable_indices[statement.var],
                array_dtype=c_data_type(self.variables[statement.var].dtype),
                sign=sign, expr=expr)
            # this is now a list of 1 or 2 lines (potentially with if(...))
            line = self.conditional_write(line, statement,
                                          conditional_write_vars)
        else:
            raise ParallelisationError()

        if len(statement.comment):
            line[-1] += ' // ' + statement.comment

        return line
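    # Illustrative sketch (hypothetical synapse statement, not part of the
    # module): for the in-place statement `v_post -= w`, the branch above emits a
    # line like
    #     _brian_atomicAdd(&_ptr_array_neurongroup_v[_postsynaptic_idx], (double)-(w));
    # i.e. `-=` is expressed as an atomicAdd of the negated value, since only
    # atomicAdd has a hardware implementation (see comment above).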
    def parallelise_code(self, statements):
        try:
            used_variables = set()
            all_read, all_write, all_indices, _ = self.arrays_helper(statements)
            lines = []
            # we are collecting all reads which are not only loaded because of
            # in-place atomic operations, and all writes which are not written to
            # by atomics
            collected_reads = set()
            collected_writes = set()
            atomic_lines = []
            for stmt in statements:
                lines.append(f'// Abstract code: {stmt.var} {stmt.op} {stmt.expr}')
                # We treat every statement individually with its own read and
                # write code to be on the safe side
                read, write, indices, conditional_write_vars = self.arrays_helper([stmt])
                # No need to load a variable if it is only in read because of the
                # in-place operation, but still load the index var, we need it for
                # the atomics
                if (stmt.inplace
                        and self.variable_indices[stmt.var] != '_idx'
                        and stmt.var not in get_identifiers(stmt.expr)):
                    read = read - {stmt.var}
                collected_reads = collected_reads.union(read)
                atomic_lines.extend(self.atomics_parallelisation(stmt,
                                                                 conditional_write_vars,
                                                                 used_variables))
                # Do not write back such values, the atomic functions have
                # modified the underlying array already
                if stmt.inplace and self.variable_indices[stmt.var] != '_idx':
                    write = write - {stmt.var}
                collected_writes = collected_writes.union(write)
            # we translate to read arrays altogether, otherwise we end up with
            # multiple declarations for statement identifiers
            # pass all_write here, since this determines the `const` keyword
            # (otherwise the variables written to by atomics would be `const`)
            # and all_indices, since we need those for the atomic calls
            lines.extend(self.translate_to_read_arrays(collected_reads, all_write,
                                                       all_indices))
            # we need to declare variables which will be written to but not read
            # (e.g. in assignments like `v_post = 1`) and which are not used in
            # atomics (hence we use `collected_writes`)
            lines.extend(self.translate_to_declarations(all_read, collected_writes,
                                                        all_indices))
            # add the atomic operations and statements
            lines.extend(atomic_lines)
            # only write variables which are not written to by atomics
            lines.extend(self.translate_to_write_arrays(collected_writes))
        except ParallelisationError:
            # log info here, since this means we tried but failed to use atomics
            logger.info("Failed to parallelise code by using atomic operations. "
                        "Falling back to serialized effect application. This "
                        "might be slow. Switching to `cpp_standalone` might be "
                        "faster. Code object name is {} and first line in "
                        "abstract code is: {}".format(self.name, statements[0]),
                        once=True)
            raise

        return lines
    def denormals_to_zero_code(self):
        if self.flush_denormals:
            return '''
            #define CSR_FLUSH_TO_ZERO         (1 << 15)
            unsigned csr = __builtin_ia32_stmxcsr();
            csr |= CSR_FLUSH_TO_ZERO;
            __builtin_ia32_ldmxcsr(csr);
            '''
        else:
            return ''
    def _add_user_function(self, varname, variable, added):
        impl = variable.implementations[self.codeobj_class]
        if (impl.name, variable) in added:
            return  # nothing to do
        else:
            added.add((impl.name, variable))
        support_code = []
        hash_defines = []
        pointers = []
        kernel_lines = []
        user_functions = [(varname, variable)]
        funccode = impl.get_code(self.owner)
        if isinstance(funccode, str):
            # Rename references to any dependencies if necessary
            for dep_name, dep in impl.dependencies.items():
                dep_impl = dep.implementations[self.codeobj_class]
                dep_impl_name = dep_impl.name
                if dep_impl_name is None:
                    dep_impl_name = dep.pyfunc.__name__
                if dep_name != dep_impl_name:
                    funccode = word_substitute(funccode,
                                               {dep_name: dep_impl_name})

            ### Different from CPPCodeGenerator: We format the funccode dtypes here
            from brian2.devices.device import get_device
            device = get_device()
            if varname in functions_C99:
                funccode = funccode.format(default_type=self.default_func_type,
                                           other_type=self.other_func_type)
            elif varname in ['clip', 'exprel']:
                funccode = funccode.format(float_dtype=self.float_dtype)
            ###

        if isinstance(funccode, str):
            funccode = {'support_code': funccode}
        if funccode is not None:
            # To make namespace variables available to functions, we create global
            # variables and assign to them in the main code
            func_namespace = impl.get_namespace(self.owner) or {}
            for ns_key, ns_value in func_namespace.items():
                # This section is adapted from CPPCodeGenerator such that file
                # global namespace pointers can be used in both host and device
                # code.
                assert hasattr(ns_value, 'dtype'), \
                    'This should not have happened. Please report at ' \
                    'https://github.com/brian-team/brian2cuda/issues/new'
                if ns_value.shape == ():
                    raise NotImplementedError((
                        'Directly replace scalar values in the function '
                        'instead of providing them via the namespace'))
                type_str = self.c_data_type(ns_value.dtype) + '*'
                namespace_ptr = '''
                    #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0))
                    __device__ {dtype} _namespace{name};
                    #else
                    {dtype} _namespace{name};
                    #endif
                    '''.format(dtype=type_str, name=ns_key)
                support_code.append(namespace_ptr)
                # pointer lines will be used in codeobjects running on the host
                pointers.append('_namespace{name} = {name};'.format(name=ns_key))
                # kernel lines will be used in codeobjects running on the device
                kernel_lines.append('''
                    #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0))
                    _namespace{name} = d{name};
                    #else
                    _namespace{name} = {name};
                    #endif
                    '''.format(name=ns_key))
            support_code.append(deindent(funccode.get('support_code', '')))
            hash_defines.append(deindent(funccode.get('hashdefine_code', '')))

        dep_hash_defines = []
        dep_pointers = []
        dep_support_code = []
        dep_kernel_lines = []
        if impl.dependencies is not None:
            for dep_name, dep in impl.dependencies.items():
                if dep_name not in self.variables:
                    self.variables[dep_name] = dep
                    dep_impl = dep.implementations[self.codeobj_class]
                    if dep_name != dep_impl.name:
                        self.func_name_replacements[dep_name] = dep_impl.name
                    user_function = self._add_user_function(dep_name, dep, added)
                    if user_function is not None:
                        hd, ps, sc, uf, kl = user_function
                        dep_hash_defines.extend(hd)
                        dep_pointers.extend(ps)
                        dep_support_code.extend(sc)
                        user_functions.extend(uf)
                        dep_kernel_lines.extend(kl)

        return (dep_hash_defines + hash_defines,
                dep_pointers + pointers,
                dep_support_code + support_code,
                user_functions,
                dep_kernel_lines + kernel_lines)
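    # Illustrative sketch (hypothetical namespace key `my_values`, not part of the
    # module): for a function implementation whose namespace provides an array
    # under `my_values`, the loop above adds support code like
    #     #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0))
    #     __device__ double* _namespacemy_values;
    #     #else
    #     double* _namespacemy_values;
    #     #endif
    # plus a host pointer line `_namespacemy_values = my_values;` and a kernel
    # line that assigns the device copy, `_namespacemy_values = dmy_values;`, when
    # compiled for device code.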
    def determine_keywords(self):
        # set up the restricted pointers, these are used so that the compiler
        # knows there is no aliasing in the pointers, for optimisation
        pointers = []
        # Add additional lines inside the kernel functions
        kernel_lines = []
        # It is possible that several different variable names refer to the same
        # array. E.g. in gapjunction code, v_pre and v_post refer to the same
        # array if a group is connected to itself
        handled_pointers = set()
        template_kwds = {}
        # Again, do the import here to avoid a circular dependency.
        from brian2.devices.device import get_device
        device = get_device()
        for varname, var in self.variables.items():
            if isinstance(var, ArrayVariable):
                # This is the "true" array name, not the restricted pointer.
                array_name = device.get_array_name(var)
                pointer_name = self.get_array_name(var)
                if pointer_name in handled_pointers:
                    continue
                if getattr(var, 'ndim', 1) > 1:
                    # multidimensional (dynamic) arrays have to be treated
                    # differently
                    continue
                restrict = self.restrict
                # turn off restricted pointers for scalars for safety
                if var.scalar:
                    restrict = ' '
                # Need to use correct dt type in pointers_lines for single
                # precision, see #148
                if varname == "dt" and prefs.core.default_float_dtype == np.float32:
                    # c_data_type(variable.dtype) is float, but we need double
                    dtype = "double"
                else:
                    dtype = self.c_data_type(var.dtype)
                line = '{0}* {1} {2} = {3};'.format(dtype, restrict, pointer_name,
                                                    array_name)
                pointers.append(line)
                handled_pointers.add(pointer_name)

        # set up the functions
        user_functions = []
        support_code = []
        hash_defines = []
        added = set()  # keep track of functions that were added
        for varname, variable in list(self.variables.items()):
            if isinstance(variable, Function):
                user_func = self._add_user_function(varname, variable, added)
                if user_func is not None:
                    hd, ps, sc, uf, kl = user_func
                    user_functions.extend(uf)
                    support_code.extend(sc)
                    pointers.extend(ps)
                    hash_defines.extend(hd)
                    kernel_lines.extend(kl)

        # Generate universal_support_code once when the first codeobject is
        # created. Can't do it at import time since we need access to user
        # preferences. This is a class attribute (not an instance attribute).
        if CUDACodeGenerator.universal_support_code is None:
            _atomic_support_code = _generate_atomic_support_code()
            CUDACodeGenerator.universal_support_code = (
                _hightype_support_code
                + _mod_support_code
                + _floordiv_support_code
                + _pow_support_code
                + _atomic_support_code
            )
        support_code.append(CUDACodeGenerator.universal_support_code)

        # Clock variables (t, dt, timestep) are passed by value to kernels and
        # need to be translated back into pointers for scalar/vector code.
        for varname, variable in self.variables.items():
            if hasattr(variable, 'owner') and isinstance(variable.owner, Clock):
                # get arrayname without _ptr suffix (e.g. _array_defaultclock_dt)
                arrayname = self.get_array_name(variable, prefix='')
                # kernel_lines appear before dt is cast to float (in scalar_code),
                # hence we need to still use double (used in kernel parameters),
                # see #148
                if varname == "dt" and prefs.core.default_float_dtype == np.float32:
                    # c_data_type(variable.dtype) is float, but we need double
                    dtype = "double"
                else:
                    dtype = c_data_type(variable.dtype)
                line = f"const {dtype}* _ptr{arrayname} = &_value{arrayname};"
                if line not in kernel_lines:
                    kernel_lines.append(line)

        keywords = {'pointers_lines': stripped_deindented_lines('\n'.join(pointers)),
                    'support_code_lines': stripped_deindented_lines('\n'.join(support_code)),
                    'hashdefine_lines': stripped_deindented_lines('\n'.join(hash_defines)),
                    'denormals_code_lines': stripped_deindented_lines('\n'.join(self.denormals_to_zero_code())),
                    'kernel_lines': stripped_deindented_lines('\n'.join(kernel_lines)),
                    'uses_atomics': self.uses_atomics
                    }
        keywords.update(template_kwds)
        return keywords
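    # Illustrative sketch (hypothetical network, not part of the module): with
    # double precision, the clock-variable loop above adds a kernel line like
    #     const double* _ptr_array_defaultclock_dt = &_value_array_defaultclock_dt;
    # so that scalar/vector code can keep addressing clock variables through the
    # usual `_ptr...` pointer names even though kernels receive them by value.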
class CUDAAtomicsCodeGenerator(CUDACodeGenerator):

    _use_atomics = True
################################################################################
# Implement functions
################################################################################

# The CUDA Math library supports all C99 standard float and double math
# functions. To have support for integral types in device code, we need to wrap
# these functions and convert integral types to floating-point types. In host
# code, we can just use the standard math functions directly.

### Functions available in the C99 standard
functions_C99 = []
func_translations = []
# Functions that exist under the same name in C++
for func in ['sin', 'cos', 'tan', 'sinh', 'cosh', 'tanh', 'exp', 'log', 'log10',
             'expm1', 'log1p', 'sqrt', 'ceil', 'floor']:
    func_translations.append((func, func))

# Functions that exist under a different name in C++
for func, func_cuda in [('arcsin', 'asin'), ('arccos', 'acos'),
                        ('arctan', 'atan')]:
    func_translations.append((func, func_cuda))

for func, func_cuda in func_translations:
    functions_C99.append(func)
    cuda_code = '''
        template <typename T>
        __host__ __device__
        {{default_type}} _brian_{func}(T value)
        {{{{
        #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0))
            return {func}(({{default_type}})value);
        #else
            return {func}(value);
        #endif
        }}}}

        inline __host__ __device__
        {{other_type}} _brian_{func}({{other_type}} value)
        {{{{
            return {func}(value);
        }}}}
        '''.format(func=func_cuda)
    # {default_type} and {other_type} will be formatted in
    # CUDACodeGenerator.determine_keywords() depending on user prefs (which are
    # not yet set when this code snippet is created)
    DEFAULT_FUNCTIONS[func].implementations.add_implementation(
        CUDACodeGenerator,
        code=cuda_code,
        name=f'_brian_{func_cuda}'
    )

# TODO: make float version type safe or print warning (see #233)
exprel_code = '''
__host__ __device__
static inline {float_dtype} _brian_exprel({float_dtype} x)
{{
    if (fabs(x) < 1e-16)
        return 1.0;
    if (x > 717)
        return INFINITY;
    return expm1(x)/x;
}}
'''
DEFAULT_FUNCTIONS['exprel'].implementations.add_implementation(CUDACodeGenerator,
                                                               code=exprel_code,
                                                               name='_brian_exprel')

# std::abs is available and already overloaded for integral types in device code
abs_code = '''
#define _brian_abs std::abs
'''
DEFAULT_FUNCTIONS['abs'].implementations.add_implementation(CUDACodeGenerator,
                                                            code=abs_code,
                                                            name='_brian_abs')

### Functions that need to be implemented specifically
int_code = '''
template <typename T>
__host__ __device__
int _brian_int(T value)
{
    return (int)value;
}

template <>
inline __host__ __device__
int _brian_int(bool value)
{
    return value ? 1 : 0;
}
'''
DEFAULT_FUNCTIONS['int'].implementations.add_implementation(CUDACodeGenerator,
                                                            code=int_code,
                                                            name='_brian_int')

# TODO: make float version type safe or print warning (see #51)
clip_code = '''
inline __host__ __device__
{float_dtype} _brian_clip(const {float_dtype} value,
                          const {float_dtype} a_min,
                          const {float_dtype} a_max)
{{
    if (value < a_min)
        return a_min;
    if (value > a_max)
        return a_max;
    return value;
}}
'''
DEFAULT_FUNCTIONS['clip'].implementations.add_implementation(CUDACodeGenerator,
                                                             code=clip_code,
                                                             name='_brian_clip')

sign_code = '''
template <typename T>
__host__ __device__
int _brian_sign(T val)
{
    return (T(0) < val) - (val < T(0));
}
'''
DEFAULT_FUNCTIONS['sign'].implementations.add_implementation(CUDACodeGenerator,
                                                             code=sign_code,
                                                             name='_brian_sign')

randn_code = '''
#define _randn(vectorisation_idx) (_ptr_array_%CODEOBJ_NAME%_randn[vectorisation_idx])
'''
DEFAULT_FUNCTIONS['randn'].implementations.add_implementation(CUDACodeGenerator,
                                                              code=randn_code,
                                                              name='_randn')

rand_code = '''
#define _rand(vectorisation_idx) (_ptr_array_%CODEOBJ_NAME%_rand[vectorisation_idx])
'''
DEFAULT_FUNCTIONS['rand'].implementations.add_implementation(CUDACodeGenerator,
                                                             code=rand_code,
                                                             name='_rand')

poisson_code = '''
// Notes on the poisson function implementation:
// - Curand generates unsigned int, brian uses int32_t, casting is happening here
// - The only codeobject that uses host side random numbers is
//   synapses_create_generator.cu. There, a C++ poisson function (same as used in
//   C++ Standalone) is implemented and used via _host_poisson(...)
// - For synapses_create_generator.cu templates, the _poisson calls are unchanged
//   (no regex substitution) and hence just use the C++ implementation
// - For device side poisson, we have two cases:
//   - If the lambda is constant across units of the codeobject (scalar lambda),
//     we use host side RNG in rand.cu and pass the data pointer to _poisson
//     (first overloaded definition).
//   - If the lambda is a variable itself that might be different for each unit
//     in the codeobject (vectorized lambda), we instead use device side RNG
//     (device code path in the second overloaded _poisson definition).
__device__
int32_t _poisson(unsigned int* _poisson_buffer, int32_t _idx)
{
    // poisson with constant lambda are generated with curand host API in rand.cu
    return (int32_t) _poisson_buffer[_idx];
}

__host__ __device__
int32_t _poisson(double _lambda, int32_t _idx)
{
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0))
    if (_lambda <= 0)
    {
        return 0;
    }
    else
    {
        // poisson with variable lambda are generated with curand device API
        // on-the-fly
        curandState localState = brian::d_curand_states[_idx];
        unsigned int poisson_number = curand_poisson(&localState, _lambda);
        brian::d_curand_states[_idx] = localState;
        return (int32_t) poisson_number;
    }
#else
    // use C++ implementation defined in synapses_create_generator.cu
    return _host_poisson(_lambda, _idx);
#endif
}
'''
DEFAULT_FUNCTIONS['poisson'].implementations.add_implementation(
    CUDACodeGenerator,
    code=poisson_code,
    name='_poisson',
    compiler_kwds={"headers": ["<curand.h>"]}
)

# Add support for the `timestep` function added in Brian 2.3.1
timestep_code = '''
__host__ __device__
static inline int64_t _timestep(double t, double dt)
{
    return (int64_t)((t + 1e-3*dt)/dt);
}
'''
DEFAULT_FUNCTIONS['timestep'].implementations.add_implementation(CUDACodeGenerator,
                                                                 code=timestep_code,
                                                                 name='_timestep')
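# Illustrative usage sketch (hypothetical user function, not part of the module):
# user-defined functions can target this generator via its class_name 'cuda',
# providing the 'support_code' (and optionally 'hashdefine_code') keys described
# in the CUDACodeGenerator docstring. A minimal example, assuming brian2cuda is
# set as the active device:
#
#   from brian2 import Function, implementation, check_units
#
#   @implementation('cuda', '''
#   __host__ __device__ double square(double x)
#   {
#       return x * x;
#   }
#   ''')
#   @check_units(x=1, result=1)
#   def square(x):
#       return x ** 2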