# Source code for brian2cuda.cuda_prefs

'''
Preferences that relate to the brian2cuda interface.
'''
import numpy as np

from brian2.core.preferences import prefs, BrianPreference
from brian2.core.core_preferences import default_float_dtype_validator, dtype_repr
from brian2.utils.logger import get_logger


# Module-level logger; ``validate_bundle_size_expression`` reports rejected
# expressions through it.
logger = get_logger('brian2.devices.cuda_standalone.cuda_prefs')

def validate_bundle_size_expression(string):
    '''
    Check whether ``string`` is a usable bundle size expression.

    The expression may contain the format fields ``{mean}``, ``{std}``,
    ``{max}`` and ``{min}`` as well as the functions ``ceil`` and ``floor``.
    It is checked by substituting ``0`` for every field, mapping ``ceil`` and
    ``floor`` to their numpy counterparts and evaluating the result with
    Python's ``eval``.

    Parameters
    ----------
    string : str
        The expression to validate.

    Returns
    -------
    bool
        ``True`` if the expression could be formatted and evaluated, ``False``
        otherwise. The reason for a rejection is logged as an error; no
        exception is raised for invalid input.
    '''
    known_vars = ['mean', 'std', 'max', 'min']

    # Anything but a string has no meaningful ``format`` step; reject it here
    # instead of letting an AttributeError escape from the validator.
    if not isinstance(string, str):
        logger.error(
            f"Expression has to be a string, got '{type(string).__name__}'"
        )
        return False

    try:
        # Try formatting all known_vars with 0
        formatted = string.format(**dict.fromkeys(known_vars, 0))
    except KeyError as error:
        logger.error(
            f"Unknown formatting variable {error}. Known variables are:"
            f" {known_vars}"
        )
        return False
    except (IndexError, ValueError, AttributeError, TypeError) as error:
        # IndexError: positional fields such as '{}' or '{0}'
        # ValueError: unbalanced braces or an invalid format spec
        # AttributeError / TypeError: attribute or index access on a field,
        # e.g. '{mean.x}' or '{mean[0]}'
        logger.error(f"Can't format expression '{string}': {error}")
        return False

    # Replace names from C++ std with numpy version for eval test below
    formatted = formatted.replace("ceil", "np.ceil")
    formatted = formatted.replace("floor", "np.floor")

    try:
        # NOTE(review): ``eval`` executes arbitrary Python. This presumably is
        # acceptable because preference values come from the user's own script
        # or preference file -- never feed untrusted input into this validator.
        eval(formatted)
    except Exception:
        logger.error(f"Can't evaluate expression '{string}'")
        return False

    return True


# Preferences
# General Brian2CUDA preferences, registered under the
# ``devices.cuda_standalone`` prefix. Every keyword argument below defines one
# preference of that name; ``docs`` is the user-facing description, and
# ``validator`` (where given) is called with a candidate value and returns
# whether it is acceptable.
prefs.register_preferences(
    'devices.cuda_standalone',
    'Brian2CUDA preferences',

    # --- kernel launch configuration ---
    SM_multiplier = BrianPreference(
        default=1,
        docs='The number of blocks per SM. By default, this value is set to 1.',
        ),

    # NOTE(review): the validator admits ``None`` (documented as "use the
    # number of SMs"), while the default value is ``1``.
    parallel_blocks = BrianPreference(
        docs='''The total number of parallel blocks to use. If ``None``, the number of parallel
                blocks equals the number streaming multiprocessors on the GPU.''',
        validator=lambda v: v is None or (isinstance(v, int) and v > 0),
        default=1),

    launch_bounds=BrianPreference(
        docs='Wether or not to use ``__launch_bounds__`` to optimise register usage in kernels.',
        default=False),

    syn_launch_bounds=BrianPreference(
        docs='Wether or not to use ``__launch_bounds__`` in synapses and synapses_push to optimise register usage in kernels.',
        default=False),

    calc_occupancy=BrianPreference(
        docs='Wether or not to use cuda occupancy api to choose num_threads and num_blocks.',
        default=True),

    extra_threshold_kernel=BrianPreference(
        docs='Wether or not to use a extra threshold kernel for resetting.',
        default=True),

    # --- random number generation (cuRAND) ---
    random_number_generator_type=BrianPreference(
        docs='''Generator type (str) that cuRAND uses for random number generation.
            Setting the generator type automatically resets the generator ordering
            (prefs.devices.cuda_standalone.random_number_generator_ordering) to its default value.
            See cuRAND documentation for more details on generator types and orderings.''',
        validator=lambda v: v in ['CURAND_RNG_PSEUDO_DEFAULT',
                                  'CURAND_RNG_PSEUDO_XORWOW',
                                  'CURAND_RNG_PSEUDO_MRG32K3A',
                                  'CURAND_RNG_PSEUDO_MTGP32',
                                  'CURAND_RNG_PSEUDO_PHILOX4_32_10',
                                  'CURAND_RNG_PSEUDO_MT19937',
                                  'CURAND_RNG_QUASI_DEFAULT',
                                  'CURAND_RNG_QUASI_SOBOL32',
                                  'CURAND_RNG_QUASI_SCRAMBLED_SOBOL32',
                                  'CURAND_RNG_QUASI_SOBOL64',
                                  'CURAND_RNG_QUASI_SCRAMBLED_SOBOL64'],
        default='CURAND_RNG_PSEUDO_DEFAULT'),

    # Any falsy value passes the validator and means "do not set an ordering".
    random_number_generator_ordering=BrianPreference(
        docs='''The ordering parameter (str) used to choose how the results of cuRAND
            random number generation are ordered in global memory.
            See cuRAND documentation for more details on generator types and orderings.''',
        validator=lambda v: not v or v in ['CURAND_ORDERING_PSEUDO_DEFAULT',
                                           'CURAND_ORDERING_PSEUDO_BEST',
                                           'CURAND_ORDERING_PSEUDO_SEEDED',
                                           'CURAND_ORDERING_QUASI_DEFAULT'],
        default=False),  # False will prevent setting ordering in objects.cu (-> cuRAND will use the correct ..._DEFAULT)

    # --- synaptic propagation ---
    push_synapse_bundles=BrianPreference(
        docs='''If True, synaptic events are propagated by pushing bundles of
        synapse IDs with same delays into the corresponding delay queue. If
        False, each synapse of a spiking neuron is pushed in the corresponding
        queue individually. For very small bundle sizes (number of synapses
        with same delay, connected to a single neuron), pushing single Synapses
        can be faster. This option only has effect for ``Synapses`` objects with
        heterogenous delays.''',
        default=True),

    # The expression string is checked by ``validate_bundle_size_expression``.
    threads_per_synapse_bundle=BrianPreference(
        docs='''The number of threads used per synapses bundle during effect
        application. This has to be a string, which can be passed to Python's ``eval``
        function. The string can can use ``{mean}``, ``{std}``, ``{max}`` and ``{min}``
        expressions, which refer to the statistics across all bundles, and the function
        'ceil'. The result of this expression will be converted to the next
        lower ``int`` (e.g. ``1.9`` will be cast to ``1.0``). Examples: ``'{mean} + 2 *
        {std}'`` will use the mean bunde size + 2 times the standard deviation over
        bundle sizes and round it to the next lower integer. If you want to round up
        instead, use ``'ceil({mean} + 2 * {std})'``.''',
        default="{max}",
        validator=validate_bundle_size_expression),

    # The docs have to name exactly the literals the validator accepts:
    # 'up', 'down' or False.
    bundle_threads_warp_multiple=BrianPreference(
        docs='''Whether to round the number of threads used per synapse bundle during
        effect application (see
        `devices.cuda_standalone.threads_per_synapse_bundle`) to a multiple of
        the warp size. Round to next multiple if preference is ``'up'``, round to
        previous multiple if ``'down'`` and don't round at all if ``False`` (default). If
        rounding down results in ``0`` threads, ``1`` thread is used instead.''',
        default=False,
        validator=lambda v: v in ['up', 'down', False]),

    # --- memory optimisation switches ---
    no_pre_references=BrianPreference(
        docs='''Set this preference if you don't need access to ``i`` in any
        synaptic code string and no Synapses object applies effects to
        presynaptic variables. This preference is for memory optimization until
        unnecassary device memory allocations in synapse creation are fixed, it
        is only relevant if your network uses close to all memory.''',
        default=False),

    no_post_references=BrianPreference(
        docs='''Set this preference if you don't need access to ``j`` in any
        synaptic code string and no Synapses object applies effects to
        postsynaptic variables. This preference is for memory optimization until
        unnecassary device memory allocations in synapse creation are fixed, it
        is only relevant if your network uses close to all memory.''',
        default=False),

    # --- code generation details ---
    # The (misspelled) preference name is part of the public interface and is
    # therefore kept as is.
    default_functions_integral_convertion=BrianPreference(
        docs='''The floating point precision to which integral types will be converted when
        passed as arguments to default functions that have no integral type overload in device
        code (sin, cos, tan, sinh, cosh, tanh, exp, log, log10, sqrt, ceil, floor, arcsin, arccos, arctan)."
        NOTE: Convertion from 32bit and 64bit integral types to single precision (32bit) floating-point
        types is not type safe. And convertion from 64bit integral types to double precision (64bit)
        floating-point types neither. In those cases the closest higher or lower (implementation
        defined) representable value will be selected.''',
        validator=default_float_dtype_validator,
        representor=dtype_repr,
        default=np.float64),

    use_atomics=BrianPreference(
        docs='''Weather to try to use atomic operations for synaptic effect
        application. Since this avoids race conditions, effect application can
        be parallelised.''',
        validator=lambda v: isinstance(v, bool),
        default=True),

    # --- benchmarking ---
    profile_statemonitor_copy_to_host=BrianPreference(
        docs='''Profile the final device to host copy of StateMonitor data. This
        preference is used for benchmarking and assumes that there is only one active
        StateMonitor in the network. The parameter of this preference is the recorded
        variable for which the device to host copy is recorded (e.g. 'v').''',
        validator=lambda v: v is None or isinstance(v, str),
        default=None),
)

# Preferences for GPU selection and the CUDA toolchain, registered under the
# ``devices.cuda_standalone.cuda_backend`` prefix. Every entry lists its
# default first, then the validator (if any) and finally the user-facing docs.
prefs.register_preferences(
    'devices.cuda_standalone.cuda_backend',
    'Preferences for the CUDA backend in Brian2CUDA',

    # --- GPU selection and device memory ---
    gpu_heap_size=BrianPreference(
        default=128,
        validator=lambda size: isinstance(size, int) and size >= 0,
        docs='''Size of the heap (in MB) used by malloc() and free() device system calls, which
        are used in the ``cudaVector`` implementation. ``cudaVectors`` are used to
        dynamically allocate device memory for ``SpikeMonitors`` and the synapse
        queues in the ``CudaSpikeQueue`` implementation for networks with
        heterogeneously distributed delays.''',
    ),

    detect_gpus=BrianPreference(
        default=True,
        validator=lambda flag: isinstance(flag, bool),
        docs='''Whether to detect names and compute capabilities of all available GPUs.
        This needs access to ``nvidia-smi`` and ``deviceQuery`` binaries.''',
    ),

    gpu_id=BrianPreference(
        default=None,
        validator=lambda gpu: gpu is None or isinstance(gpu, int),
        docs='''The ID of the GPU that should be used for code execution. Default value is
        ``None``, in which case the GPU with the highest compute capability and lowest ID
        is used.

        If environment variable ``CUDA_VISIBLE_DEVICES`` is set, this preference will be
        interpreted as ID from the visible devices (e.g. with ``CUDA_VISIBLE_DEVICES=2``
        and ``gpu_id=0`` preference, the GPU 2 will be used).
        ''',
    ),

    # --- compilation ---
    # No explicit validator is passed for the nvcc arguments.
    extra_compile_args_nvcc=BrianPreference(
        default=['-w', '-use_fast_math'],
        docs='Extra compile arguments (a list of strings) to pass to the nvcc compiler.',
    ),

    compute_capability=BrianPreference(
        default=None,
        validator=lambda cc: cc is None or isinstance(cc, float),
        docs='''Manually set the compute capability for which CUDA code will be
        compiled. Has to be a float (e.g. ``6.1``) or None. If None, compute capability is
        chosen depending on GPU in use. ''',
    ),

    # --- CUDA installation ---
    detect_cuda=BrianPreference(
        default=True,
        validator=lambda flag: isinstance(flag, bool),
        docs='''Whether to try to detect CUDA installation paths and version. Disable
        this if you want to generae CUDA standalone code on a system without CUDA
        installed.''',
    ),

    cuda_path=BrianPreference(
        default=None,
        validator=lambda path: path is None or isinstance(path, str),
        docs='''The path to the CUDA installation. If set, this preferences takes
        precedence over environment variable ``CUDA_PATH``.''',
    ),

    cuda_runtime_version=BrianPreference(
        default=None,
        validator=lambda version: version is None or isinstance(version, float),
        docs='The CUDA runtime version.',
    ),

    device_query_path=BrianPreference(
        default=None,
        validator=lambda path: path is None or isinstance(path, str),
        docs='''Path to CUDA's deviceQuery binary. Used to detect a GPUs compute
        capability''',
    ),
)