Source code for brian2cuda.device

'''
Module implementing the CUDA "standalone" device.
'''
import os
import inspect
from collections import defaultdict, Counter
import tempfile
from distutils import ccompiler
import re
from itertools import chain

import numpy as np

from brian2.codegen.cpp_prefs import get_compiler_and_args
from brian2.codegen.translation import make_statements
from brian2.core.clocks import Clock, defaultclock
from brian2.core.namespace import get_local_namespace
from brian2.core.preferences import prefs, PreferenceError
from brian2.core.variables import ArrayVariable, DynamicArrayVariable, Constant
from brian2.parsing.rendering import CPPNodeRenderer
from brian2.devices.device import all_devices
from brian2.synapses.synapses import Synapses, SynapticPathway
from brian2.utils.filetools import copy_directory, ensure_directory
from brian2.utils.stringtools import get_identifiers, stripped_deindented_lines
from brian2.codegen.generators.cpp_generator import c_data_type
from brian2.utils.logger import get_logger
from brian2.units import second
from brian2.monitors import SpikeMonitor, StateMonitor, EventMonitor
from brian2.groups import Subgroup

from brian2.devices.cpp_standalone.device import CPPWriter, CPPStandaloneDevice
from brian2.input.spikegeneratorgroup import SpikeGeneratorGroup

from brian2cuda.utils.stringtools import replace_floating_point_literals
from brian2cuda.utils.gputools import select_gpu, get_nvcc_path
from brian2cuda.utils.logger import report_issue_message

from .codeobject import CUDAStandaloneCodeObject, CUDAStandaloneAtomicsCodeObject


__all__ = []

logger = get_logger('brian2.devices.cuda_standalone')


[docs]class CUDAWriter(CPPWriter): def __init__(self, project_dir): self.project_dir = project_dir self.source_files = set() self.header_files = set()
[docs] def write(self, filename, contents): logger.diagnostic(f'Writing file {filename}:\n{contents}') if filename.lower().endswith('.cu'): self.source_files.add(filename) if filename.lower().endswith('.cpp'): self.source_files.add(filename) elif filename.lower().endswith('.h'): self.header_files.add(filename) elif filename.endswith('.*'): self.write(filename[:-1]+'cu', contents.cu_file) self.write(filename[:-1]+'h', contents.h_file) return fullfilename = os.path.join(self.project_dir, filename) if os.path.exists(fullfilename): if open(fullfilename, 'r').read()==contents: return open(fullfilename, 'w').write(contents)
[docs]class CUDAStandaloneDevice(CPPStandaloneDevice): ''' The `Device` used for CUDA standalone simulations. ''' def __init__(self): super(CUDAStandaloneDevice, self).__init__() ### Reset variables we don't need from CPPStandaloneDevice.__init__() # remove randomkit, which we don't use for CUDA Standalone self.include_dirs.remove('brianlib/randomkit') self.library_dirs.remove('brianlib/randomkit') # Add code line slots used in our benchmarks # TODO: Add to brian2 and remove here self.code_lines.update({'before_network_run': [], 'after_network_run': []}) ### Attributes specific to CUDAStandaloneDevice: # only true during first run call (relevant for synaptic pre/post ID deletion) self.first_run = True # the minimal supported GPU compute capability self.minimal_compute_capability = 3.5 # store the ID of the used GPU and it's compute capability self.gpu_id = None self.compute_capability = None # list of pre/post ID and delay arrays that are not needed in device memory self.delete_synaptic_pre = {} self.delete_synaptic_post = {} self.delete_synaptic_delay = {} # The following nested dictionary collects all codeobjects that use random # number generation (RNG). self.codeobjects_with_rng = { # All codeobjects that use the curand device api (binomial and # poisson with vectorized lambda) "device_api": { # This collects all codeobjects that run evey cycle of a clock "every_tick": [], # This collects all codeobjects that are running only once "single_tick": [] }, # All codeobjects that use the curand host api (rand, randn and poisson with # scalar lambda) "host_api": { # Dictionary of lists of codeobjects. Dictionary keys are the RNG types # (rand, randn, poisson_<idx>). For each `poisson(lamda)` function with # a different scalar lamda per codeobject, a new `poisson_<idx>` key is # added to the defaultdict. The dictionary values are lists of # codeobject. "all_runs": defaultdict(list), # Lists of defaultdict(list) with same structure as "all_runs", but now # codeobjects are seperate by `brian2.run()` calls (one list item per # run). "per_run": [] }, } # Dictionary to look up `lambda` values for all `poisson` calls with scalar # `lambda`, sorted by codeobj.name and poisson_name (`poisson-<idx>`): # all_poisson_lamdas[codeobj.name][poisson_name] = lamda self.all_poisson_lamdas = defaultdict(dict) # List of multisynaptic index variables (for all Synapses with multisynaptic # index) self.multisyn_vars = [] # List of names of all variables which are only required on host and will not # be copied to device memory self.variables_on_host_only = []
[docs] def get_array_name(self, var, access_data=True, prefix=None): ''' Return a globally unique name for `var`. Parameters ---------- access_data : bool, optional For `DynamicArrayVariable` objects, specifying `True` here means the name for the underlying data is returned. If specifying `False`, the name of object itself is returned (e.g. to allow resizing). prefix: {'_ptr', 'dev', 'd'}, optional Prefix for array name. Host pointers to device memory are prefixed with `dev`, device pointers to device memory are prefixed with `d` and pointers used in `scalar_code` and `vector_code` are prefixed with `_ptr` (independent of whether they are used in host or device code). The `_ptr` variables are declared as parameters in the kernel definition (KERNEL_PARAMETERS). ''' # In single-precision mode we replace dt variables in codeobjects with # a single precision version, for details see #148 if hasattr(var, 'real_var'): return self.get_array_name(var.real_var, access_data=access_data, prefix=prefix) prefix = prefix or '' choices = ['_ptr', 'dev', 'd', ''] if prefix not in choices: raise ValueError( f"`prefix` has to be one of {choices} or `None`, got {prefix}" ) if not access_data and prefix in ['_ptr', 'd']: raise ValueError( f"Don't use `'{prefix}'` prefix for a dynamic array object." ) array_name = '' if isinstance(var, DynamicArrayVariable): if access_data: array_name = self.arrays[var] elif var.ndim == 1: array_name = self.dynamic_arrays[var] else: array_name = self.dynamic_arrays_2d[var] elif isinstance(var, ArrayVariable): array_name = self.arrays[var] else: raise TypeError(('Do not have a name for variable of type ' '%s') % type(var)) return prefix + array_name
[docs] def code_object_class(self, codeobj_class=None, fallback_pref=None): ''' Return `CodeObject` class (either `CUDAStandaloneCodeObject` class or input) Parameters ---------- codeobj_class : a `CodeObject` class, optional If this is keyword is set to None or no arguments are given, this method will return the default (`CUDAStandaloneCodeObject` class). fallback_pref : str, optional For the cuda_standalone device this option is ignored. Returns ------- codeobj_class : class The `CodeObject` class that should be used ''' # Ignore the requested pref (used for optimization in runtime) if codeobj_class is None: return CUDAStandaloneCodeObject else: return codeobj_class
[docs] def get_array_read_write(self, abstract_code, variables): ################################################################## # This code is copied from CodeGenerator.translate() and # CodeGenerator.array_read_write() and should give us a set of # variables to which will be written or from which will be read in # `vector_code` ################################################################## vector_statements = {} for ac_name, ac_code in abstract_code.items(): statements = make_statements(ac_code, variables, prefs['core.default_float_dtype'], optimise=True, blockname=ac_name) _, vector_statements[ac_name] = statements read = set() write = set() for statements in vector_statements.values(): for stmt in statements: ids = get_identifiers(stmt.expr) # if the operation is inplace this counts as a read. if stmt.inplace: ids.add(stmt.var) read = read.union(ids) write.add(stmt.var) read = set(varname for varname, var in variables.items() if isinstance(var, ArrayVariable) and varname in read) write = set(varname for varname, var in variables.items() if isinstance(var, ArrayVariable) and varname in write) return read, write
[docs] def code_object(self, owner, name, abstract_code, variables, template_name, variable_indices, codeobj_class=None, template_kwds=None, override_conditional_write=None, compiler_kwds=None): if prefs['core.default_float_dtype'] == np.float32 and 'dt' in variables: # In single-precision mode we replace dt variables in codeobjects with # a single precision version, for details see #148 dt_var = variables['dt'] new_dt_var = ArrayVariable(dt_var.name, dt_var.owner, dt_var.size, dt_var.device, dimensions=dt_var.dim, dtype=np.float32, constant=dt_var.constant, scalar=dt_var.scalar, read_only=dt_var.read_only, dynamic=dt_var.dynamic, unique=dt_var.unique) new_dt_var.real_var = dt_var new_dt_var.set_value(dt_var.get_value().item()) variables['dt'] = new_dt_var if template_kwds is None: template_kwds = dict() else: template_kwds = dict(template_kwds) template_kwds['profiled'] = self.enable_profiling template_kwds['bundle_mode'] = prefs["devices.cuda_standalone.push_synapse_bundles"] no_or_const_delay_mode = False if isinstance(owner, (SynapticPathway, Synapses)) and "delay" in owner.variables and owner.variables["delay"].scalar: # catches Synapses(..., delay=...) syntax, does not catch the case when no delay is specified at all no_or_const_delay_mode = True template_kwds["no_or_const_delay_mode"] = no_or_const_delay_mode # Check if pre/post IDs are needed per synapse # This is the case when presynapic/postsynapitc variables are used in synapses code or # if they are used to set synaptic variables after the first run call. # If in at least one `synapses` template for the same Synapses # object or in a `run_regularly` call (creates a `statupdate`) a # pre/post IDs are needed, we don't delete them. And if a synapses object that is only # run once after the first run call (e.g. syn.w['i<j'] = ...), we don't delete either. # If deleted, they will be deleted on the device in `run_lines` (see below) synapses_object_every_tick = False synapses_object_single_tick_after_run = False group_variable_set_templates = ['group_variable_set_conditional', 'group_variable_set'] if isinstance(owner, Synapses): if template_name in ['synapses', 'stateupdate', 'summed_variable']: synapses_object_every_tick = True if not self.first_run and template_name in group_variable_set_templates: synapses_object_single_tick_after_run = True if synapses_object_every_tick or synapses_object_single_tick_after_run: read, write = self.get_array_read_write(abstract_code, variables) read_write = read.union(write) synaptic_pre_array_name = self.get_array_name( owner.variables['_synaptic_pre'], access_data=False ) synaptic_post_array_name = self.get_array_name( owner.variables['_synaptic_post'], access_data=False ) if synaptic_pre_array_name not in self.delete_synaptic_pre.keys(): self.delete_synaptic_pre[synaptic_pre_array_name] = True if synaptic_post_array_name not in self.delete_synaptic_post.keys(): self.delete_synaptic_post[synaptic_post_array_name] = True error_msg = ("'devices.cuda_standalone.no_{prepost}_references' " "was set to True, but {prepost}synaptic index is " "needed for variable {varname} in {owner.name}") # Check for all variable that are read or written to if they are # i/j or their indices are pre/post for varname in variables.keys(): if varname in read_write: idx = variable_indices[varname] if idx == '_presynaptic_idx' or varname == 'i': self.delete_synaptic_pre[synaptic_pre_array_name] = False if prefs['devices.cuda_standalone.no_pre_references']: raise PreferenceError(error_msg.format(prepost='pre', varname=varname, owner=owner)) if idx == '_postsynaptic_idx' or varname == 'j': self.delete_synaptic_post[synaptic_post_array_name] = False if prefs['devices.cuda_standalone.no_post_references']: raise PreferenceError(error_msg.format(prepost='post', varname=varname, owner=owner)) # Summed variables need the indices of their target variable, which # are not in the read_write set. if template_name == 'summed_variable': idx = template_kwds['_index_var'].name varname = template_kwds['_target_var'].name if idx == '_synaptic_pre': self.delete_synaptic_pre[synaptic_pre_array_name] = False if prefs['devices.cuda_standalone.no_pre_references']: raise PreferenceError(error_msg.format(prepost='pre', varname=varname, owner=owner)) if idx == '_synaptic_post': self.delete_synaptic_post[synaptic_post_array_name] = False if prefs['devices.cuda_standalone.no_post_references']: raise PreferenceError(error_msg.format(prepost='post', varname=varname, owner=owner)) if idx == '_syaptic_post': self.delete_synaptic_post[synaptic_post_array_name] = False # Collect all variables written to in group_variable_set_templates so they can # be copied from device to host after being changed on device if template_name in group_variable_set_templates: read, write = self.get_array_read_write(abstract_code, variables) written_variables = {} for variable_name in write: var = variables[variable_name] varname = self.get_array_name(var, access_data=False) written_variables[var] = varname template_kwds['written_variables'] = written_variables # Do a similar check for the delay variables as for i/j. Delays can't be # modified in Synapses model code, so we only care about single tick objects # (setting delay values) if ( not self.first_run and 'delay' in owner.variables and isinstance(owner, SynapticPathway) ): read_write = read.union(write) varname = owner.variables['delay'].name if varname in read_write: synaptic_delay_array_name = self.get_array_name( owner.variables['delay'], access_data=False ) self.delete_synaptic_delay[synaptic_delay_array_name] = False if template_name == "synapses": prepost = template_kwds['pathway'].prepost synaptic_effects = "synapse" for varname in variables.keys(): if varname in write: idx = variable_indices[varname] if (prepost == 'pre' and idx == '_postsynaptic_idx') or (prepost == 'post' and idx == '_presynaptic_idx'): # The SynapticPathways 'target' group variables are modified if synaptic_effects == "synapse": synaptic_effects = "target" if (prepost == 'pre' and idx == '_presynaptic_idx') or (prepost == 'post' and idx == '_postsynaptic_idx'): # The SynapticPathways 'source' group variables are modified synaptic_effects = "source" template_kwds["synaptic_effects"] = synaptic_effects logger.debug(f"Synaptic effects of Synapses object {name} modify {synaptic_effects} group variables.") # use atomics if possible (except for `synapses` mode, where we cann parallelise without) # TODO: this overwrites if somebody sets a codeobject in the Synapses(..., codeobj_class=...) if prefs['devices.cuda_standalone.use_atomics'] and synaptic_effects != 'synapses': codeobj_class = CUDAStandaloneAtomicsCodeObject logger.debug( f"Using atomics in synaptic effect application of Synapses object " f"{name}" ) threads_expr = prefs.devices.cuda_standalone.threads_per_synapse_bundle pathway_name = template_kwds['pathway'].name replace_expr = { '{mean}': f'{pathway_name}_bundle_size_mean', '{std}': f'{pathway_name}_bundle_size_std', '{min}': f'{pathway_name}_bundle_size_min', '{max}': f'{pathway_name}_bundle_size_max', } for old, new in replace_expr.items(): threads_expr = threads_expr.replace(old, new) template_kwds["threads_per_synapse_bundle"] = threads_expr template_kwds["bundle_threads_warp_multiple"] = prefs.devices.cuda_standalone.bundle_threads_warp_multiple if template_name in ["synapses_create_generator", "synapses_create_array"]: if owner.multisynaptic_index is not None: template_kwds["multisynaptic_idx_var"] = owner.variables[owner.multisynaptic_index] template_kwds["no_pre_references"] = False template_kwds["no_post_references"] = False if prefs['devices.cuda_standalone.no_pre_references']: template_kwds["no_pre_references"] = True if prefs['devices.cuda_standalone.no_post_references']: template_kwds["no_post_references"] = True template_kwds["launch_bounds"] = prefs["devices.cuda_standalone.launch_bounds"] template_kwds["sm_multiplier"] = prefs["devices.cuda_standalone.SM_multiplier"] template_kwds["syn_launch_bounds"] = prefs["devices.cuda_standalone.syn_launch_bounds"] template_kwds["calc_occupancy"] = prefs["devices.cuda_standalone.calc_occupancy"] if template_name in ["threshold", "spikegenerator"]: template_kwds["extra_threshold_kernel"] = prefs["devices.cuda_standalone.extra_threshold_kernel"] if template_name == "spikemonitor" and '_source_start' in variables: # Provide subgroup start/stop as template keywords for hardcoding into is_in_subgroup template_kwds.update({'_source_start': variables['_source_start'].get_value(), '_source_stop': variables['_source_stop'].get_value()}) codeobj = super(CUDAStandaloneDevice, self).code_object(owner, name, abstract_code, variables, template_name, variable_indices, codeobj_class=codeobj_class, template_kwds=template_kwds, override_conditional_write=override_conditional_write, compiler_kwds=compiler_kwds, ) return codeobj
[docs] def check_openmp_compatible(self, nb_threads): if nb_threads > 0: raise NotImplementedError("Using OpenMP in a CUDA standalone project is not supported")
[docs] def generate_objects_source(self, writer, arange_arrays, synapses, static_array_specs, networks): sm_multiplier = prefs.devices.cuda_standalone.SM_multiplier num_parallel_blocks = prefs.devices.cuda_standalone.parallel_blocks curand_generator_type = prefs.devices.cuda_standalone.random_number_generator_type curand_generator_ordering = prefs.devices.cuda_standalone.random_number_generator_ordering self.eventspace_arrays = {} self.spikegenerator_eventspaces = [] for var, varname in self.arrays.items(): if var.name.endswith('space'): # get all eventspace variables self.eventspace_arrays[var] = varname #if hasattr(var, 'owner') and isinstance(v.owner, Clock): if isinstance(var.owner, SpikeGeneratorGroup): self.spikegenerator_eventspaces.append(varname) for var in self.eventspace_arrays.keys(): del self.arrays[var] subgroups_with_spikemonitor = set() for codeobj in self.code_objects.values(): if isinstance(codeobj.owner, SpikeMonitor): if isinstance(codeobj.owner.source, Subgroup): subgroups_with_spikemonitor.add(codeobj.owner.source.name) profile_statemonitor_copy_to_host = prefs.devices.cuda_standalone.profile_statemonitor_copy_to_host profile_statemonitor_vars = [] for var in self.dynamic_arrays_2d.keys(): is_statemon = isinstance(var.owner, StateMonitor) if (profile_statemonitor_copy_to_host and isinstance(var.owner, StateMonitor) and var.name == profile_statemonitor_copy_to_host): profile_statemonitor_vars.append(var) arr_tmp = self.code_object_class().templater.objects( None, None, array_specs=self.arrays, dynamic_array_specs=self.dynamic_arrays, dynamic_array_2d_specs=self.dynamic_arrays_2d, zero_arrays=self.zero_arrays, arange_arrays=arange_arrays, synapses=synapses, clocks=self.clocks, static_array_specs=static_array_specs, networks=networks, code_objects=self.code_objects.values(), get_array_filename=self.get_array_filename, all_codeobj_with_host_rng=self.codeobjects_with_rng["host_api"]["all_runs"], sm_multiplier=sm_multiplier, num_parallel_blocks=num_parallel_blocks, curand_generator_type=curand_generator_type, curand_generator_ordering=curand_generator_ordering, curand_float_type=c_data_type(prefs['core.default_float_dtype']), eventspace_arrays=self.eventspace_arrays, spikegenerator_eventspaces=self.spikegenerator_eventspaces, multisynaptic_idx_vars=self.multisyn_vars, profiled_codeobjects=self.profiled_codeobjects, profile_statemonitor_copy_to_host=profile_statemonitor_copy_to_host, profile_statemonitor_vars=profile_statemonitor_vars, subgroups_with_spikemonitor=sorted(subgroups_with_spikemonitor), variables_on_host_only=self.variables_on_host_only) # Reinsert deleted entries, in case we use self.arrays later? maybe unnecassary... self.arrays.update(self.eventspace_arrays) writer.write('objects.*', arr_tmp)
[docs] def generate_main_source(self, writer): main_lines = [] procedures = [('', main_lines)] runfuncs = {} run_counter = 0 for func, args in self.main_queue: if func=='before_run_code_object': codeobj, = args main_lines.append('_before_run_%s();' % codeobj.name) elif func=='run_code_object': codeobj, = args codeobj.runs_every_tick = False # Need to check for RNG functions in code objects only run once (e.g. # when setting group variables before run). The results are stored in # `code_object.rng_calls`, `code_object.poisson_lamdas` and # `code_object.needs_curand_states`. prepare_codeobj_code_for_rng(codeobj) if codeobj.needs_curand_states: self.codeobjects_with_rng["device_api"]["single_tick"].append(codeobj) if isinstance(codeobj.owner, Synapses) \ and codeobj.template_name in ['group_variable_set_conditional', 'group_variable_set']: # At curand state initalization, synapses are not generated yet. # For codeobjects run every tick, this happens in the init() of # the random number buffer called at first clock cycle of the network main_lines.append('random_number_buffer.ensure_enough_curand_states();') main_lines.append(f'_run_{codeobj.name}();') elif func == 'after_run_code_object': codeobj, = args main_lines.append(f'_after_run_{codeobj.name}();') elif func=='run_network': net, netcode = args if run_counter == 0: # These lines delete `i`/`j`/`delay` variables stored per synapse. # They need to be called after before_run_synapses_push_spikes # codeobjects, therefore just before the network creation, so I put # them here # TODO: Move this into before_run_synapses_push_spikes for synaptic_pre, delete in self.delete_synaptic_pre.items(): if delete: code = f''' dev{synaptic_pre}.clear(); dev{synaptic_pre}.shrink_to_fit(); ''' main_lines.extend(stripped_deindented_lines(code)) for synaptic_post, delete in self.delete_synaptic_post.items(): if delete: code = f''' dev{synaptic_post}.clear(); dev{synaptic_post}.shrink_to_fit(); ''' main_lines.extend(stripped_deindented_lines(code)) for synaptic_delay, delete in self.delete_synaptic_delay.items(): if delete: code = f''' dev{synaptic_delay}.clear(); dev{synaptic_delay}.shrink_to_fit(); ''' main_lines.extend(stripped_deindented_lines(code)) # The actual network code main_lines.extend(netcode) # Increment run counter run_counter += 1 elif func=='set_by_constant': arrayname, value, is_dynamic = args size_str = f"{arrayname}.size()" if is_dynamic else f"_num_{arrayname}" rendered_value = CPPNodeRenderer().render_expr(repr(value)) pointer_arrayname = f"dev{arrayname}" if arrayname.endswith('space'): # eventspace pointer_arrayname += f'[current_idx{arrayname}]' if is_dynamic: pointer_arrayname = f"thrust::raw_pointer_cast(&dev{arrayname}[0])" # Set on host code = f''' for(int i=0; i<{size_str}; i++) {{ {arrayname}[i] = {rendered_value}; }} ''' if arrayname not in self.variables_on_host_only: # Copy to device code += f''' CUDA_SAFE_CALL( cudaMemcpy( {pointer_arrayname}, &{arrayname}[0], sizeof({arrayname}[0])*{size_str}, cudaMemcpyHostToDevice ) ); ''' main_lines.extend(stripped_deindented_lines(code)) elif func=='set_by_single_value': arrayname, item, value = args pointer_arrayname = f"dev{arrayname}" if arrayname.endswith('space'): # eventspace pointer_arrayname += f'[current_idx{arrayname}]' if arrayname in self.dynamic_arrays.values(): pointer_arrayname = f"thrust::raw_pointer_cast(&dev{arrayname}[0])" # Set on host code = f"{arrayname}[{item}] = {value};" if arrayname not in self.variables_on_host_only: # Copy to device code += f''' CUDA_SAFE_CALL( cudaMemcpy( {pointer_arrayname} + {item}, &{arrayname}[{item}], sizeof({arrayname}[{item}]), cudaMemcpyHostToDevice ) ); ''' main_lines.extend(stripped_deindented_lines(code)) elif func=='set_by_array': arrayname, staticarrayname, is_dynamic = args size_str = "_num_" + arrayname if is_dynamic: size_str = arrayname + ".size()" pointer_arrayname = f"dev{arrayname}" if arrayname in self.dynamic_arrays.values(): pointer_arrayname = f"thrust::raw_pointer_cast(&dev{arrayname}[0])" # Set on host code = f''' for(int i=0; i<_num_{staticarrayname}; i++) {{ {arrayname}[i] = {staticarrayname}[i]; }} ''' if arrayname not in self.variables_on_host_only: # Copy to device code += f''' CUDA_SAFE_CALL( cudaMemcpy( {pointer_arrayname}, &{arrayname}[0], sizeof({arrayname}[0])*{size_str}, cudaMemcpyHostToDevice ) ); ''' main_lines.extend(stripped_deindented_lines(code)) elif func=='set_array_by_array': arrayname, staticarrayname_index, staticarrayname_value = args # Set on host code = f''' for(int i=0; i<_num_{staticarrayname_index}; i++) {{ {arrayname}[{staticarrayname_index}[i]] = {staticarrayname_value}[i]; }} ''' if arrayname not in self.variables_on_host_only: # Copy to device code += f''' CUDA_SAFE_CALL( cudaMemcpy( dev{arrayname}, &{arrayname}[0], sizeof({arrayname}[0])*_num_{arrayname}, cudaMemcpyHostToDevice ) ); ''' main_lines.extend(stripped_deindented_lines(code)) elif func=='resize_array': array_name, new_size = args code = f''' {array_name}.resize({new_size}); THRUST_CHECK_ERROR(dev{array_name}.resize({new_size})); ''' main_lines.extend(stripped_deindented_lines(code)) elif func=='insert_code': main_lines.append(args) elif func=='start_run_func': name, include_in_parent = args if include_in_parent: main_lines.append(f'{name}();') main_lines = [] procedures.append((name, main_lines)) elif func=='end_run_func': name, include_in_parent = args name, main_lines = procedures.pop(-1) runfuncs[name] = main_lines name, main_lines = procedures[-1] elif func=='seed': seed = args if seed is None: # draw random seed in range of possible uint64 numbers seed = np.random.randint(np.iinfo(np.uint64).max, dtype=np.uint64) main_lines.append(f'random_number_buffer.set_seed({seed!r}ULL);') else: raise NotImplementedError("Unknown main queue function type "+func) # Store the GPU ID and it's compute capability. The latter can be overwritten in # self.generate_makefile() via preferences self.gpu_id, self.compute_capability = select_gpu() # generate the finalisations for codeobj in self.code_objects.values(): if hasattr(codeobj.code, 'main_finalise'): main_lines.append(codeobj.code.main_finalise) user_headers = self.headers + prefs['codegen.cpp.headers'] main_tmp = self.code_object_class().templater.main(None, None, gpu_id=self.gpu_id, main_lines=main_lines, code_lines=self.code_lines, code_objects=self.code_objects.values(), report_func=self.report_func, dt=float(defaultclock.dt), user_headers=user_headers, gpu_heap_size=prefs['devices.cuda_standalone.cuda_backend.gpu_heap_size'] ) writer.write('main.cu', main_tmp)
[docs] def generate_codeobj_source(self, writer): code_object_defs = defaultdict(list) host_parameters = defaultdict(list) kernel_parameters = defaultdict(list) kernel_constants = defaultdict(list) additional_host_code = defaultdict(list) c_float_dtype = c_data_type(prefs['core.default_float_dtype']) c_int_dtype = 'unsigned int' # Generate data for non-constant values for codeobj in self.code_objects.values(): code_object_defs_lines = [] code_object_defs_lines_host_only = [] host_parameters_lines = [] kernel_parameters_lines = [] kernel_constants_lines = [] additional_host_code_lines = [] number_elements = "" if hasattr(codeobj, 'owner') and hasattr(codeobj.owner, '_N') and codeobj.owner._N != 0: number_elements = str(codeobj.owner._N) else: number_elements = "_N" # We need the functions to be sorted by keys for reproducable rng with a # given seed: For codeobjects that are only run once, we generate the random # numbers using the curand host API. For that, we insert code into the # `additional_code` block in the host function. If we use multiple random # function in one codeobject (e.g. rand() and randn()), the order in which # they are generated can differ between two codeobjects, which makes the # brian2.tests.test_neurongroup.test_random_values_fixed_seed fail. for k, v in sorted(codeobj.variables.items()): if k == 'dt' and prefs['core.default_float_dtype'] == np.float32: # use the double-precision array versions for dt as kernel arguments # they are cast to single-precision scalar dt in scalar_code v = v.real_var # code objects which only run once if k in ["rand", "randn", "poisson"] and codeobj.runs_every_tick == False and codeobj.template_name != "synapses_create_generator": curand_suffix = '' if prefs['core.default_float_dtype'] == np.float64: curand_suffix = 'Double' if k == "randn": num_calls = codeobj.rng_calls["randn"] code = f''' // Genenerate an array of random numbers on the device // Make sure we generate an even number of random numbers int32_t _randn_N = ({number_elements} % 2 == 0) ? {number_elements} : {number_elements} + 1; {c_float_dtype}* dev_array_randn; CUDA_SAFE_CALL( cudaMalloc( (void**)&dev_array_randn, sizeof({c_float_dtype})*_randn_N*{num_calls} ) ); CUDA_SAFE_CALL( curandGenerateNormal{curand_suffix}( curand_generator, dev_array_randn, _randn_N*{num_calls}, 0, // mean 1 // stddev ) ); ''' additional_host_code_lines.append(code) kernel_parameters_lines.append( f"{c_float_dtype}* _ptr_array_{codeobj.name}_randn" ) host_parameters_lines.append("dev_array_randn") elif k == "rand": num_calls = codeobj.rng_calls["rand"] code = f''' // Genenerate an array of random numbers on the device // Make sure we generate an even number of random numbers int32_t _rand_N = ({number_elements} % 2 == 0) ? {number_elements} : {number_elements} + 1; {c_float_dtype}* dev_array_rand; CUDA_SAFE_CALL( cudaMalloc( (void**)&dev_array_rand, sizeof({c_float_dtype})*_rand_N*{num_calls} ) ); CUDA_SAFE_CALL( curandGenerateUniform{curand_suffix}( curand_generator, dev_array_rand, _rand_N*{num_calls} ) ); ''' additional_host_code_lines.append(code) kernel_parameters_lines.append( f"{c_float_dtype}* _ptr_array_{codeobj.name}_rand" ) host_parameters_lines.append("dev_array_rand") elif k == "poisson": # We are assuming that there can be at most one poisson call per expression, # else brian2 should raise a NotImplementedError due to multiple stateful function calls. assert len(codeobj.poisson_lamdas) < 2, report_issue_message if len(codeobj.poisson_lamdas) == 0: ### On-the-fly poisson number generation (curand device API) # If we have a poisson function call and no entry in # `poisson_lamdas`, we must have a variable lamda and are # using on-the-fly RNG We don't need to add any code, we # will use the device implementation defined in # cuda_generator.py assert codeobj.needs_curand_states, report_issue_message else: # len(codeobj.poisson_lamdas) == 1 ### Pregenerated poisson number (curand host API) # There only one poisson call, hence we have only `poisson_0` poisson_name = 'poisson_0' # curand generates `unsigned int`, we cast it to `int32_t` in our `_poisson` implementation num_calls = codeobj.rng_calls[poisson_name] lamda = codeobj.poisson_lamdas[poisson_name] code = f''' // Genenerate an array of random numbers on the device // Make sure we generate an even number of random numbers int32_t _{poisson_name}_N = ({number_elements} % 2 == 0) ? {number_elements} : {number_elements} + 1; {c_int_dtype}* dev_array_{poisson_name}; CUDA_SAFE_CALL( cudaMalloc( (void**)&dev_array_{poisson_name}, sizeof(unsigned int)*_{poisson_name}_N*{num_calls} ) ); CUDA_SAFE_CALL( curandGeneratePoisson( curand_generator, dev_array_{poisson_name}, _{poisson_name}_N*{num_calls}, {lamda} ) ); ''' additional_host_code_lines.append(code) kernel_parameters_lines.append( f"{c_int_dtype}* _ptr_array_{codeobj.name}_{poisson_name}" ) host_parameters_lines.append(f"dev_array_{poisson_name}") # Clock variables (t, dt, timestep) elif hasattr(v, 'owner') and isinstance(v.owner, Clock): # Clocks only run on the host and the corresponding device variables are copied # to the device only once in the beginning and in the end of a simulation. # Therefore, we pass clock variables (t, dt, timestep) by value as kernel # parameters whenever they are needed on the device. These values are translated # into pointers in CUDACodeGenerator.determine_keywords(), such that they can be # used in scalar/vector code. arrayname = self.get_array_name(v) dtype = c_data_type(v.dtype) host_parameters_lines.append( f"{arrayname}[0]") kernel_parameters_lines.append( f"const {dtype} _value{arrayname}") # ArrayVariables (dynamic and not) elif isinstance(v, ArrayVariable): # These templates run on the host host_codeobjects = ['synapses_create_generator', 'synapses_create_array'] # These templates run on host and device (e.g. synapses_push_spikes # has a before_run codeobject that runs on host only while the run # codeobject runs on the device) host_and_device_codeobjects = ['synapses_push_spikes', 'spatialstateupdate'] prefixes = ['dev'] if codeobj.template_name in host_codeobjects: prefixes = [''] elif codeobj.template_name in host_and_device_codeobjects: # Start with no prefix, such that `_num{array}` variables are # determined from the host vector if array is dynamic prefixes = ['', 'dev'] for n_prefix, prefix in enumerate(prefixes): try: dyn_array_name = self.get_array_name(v, access_data=False, prefix=prefix) array_name = self.get_array_name(v, access_data=True, prefix=prefix) ptr_array_name = self.get_array_name(v, access_data=True, prefix='_ptr') dtype = c_data_type(v.dtype) if isinstance(v, DynamicArrayVariable): if v.ndim == 1: code_object_defs_lines.append( f'{dtype}* const {array_name} = thrust::raw_pointer_cast(&{dyn_array_name}[0]);' ) # Add host and kernel parameters only for device pointers if prefix == 'dev': # These lines are used to define the kernel call parameters, that # means only for codeobjects running on the device. The array names # always have a `_dev` prefix. host_parameters_lines.append(f"{array_name}") # These lines declare kernel parameters as the `_ptr` variables that # are used in `scalar_code` and `vector_code`. # TODO: here we should add const / __restrict and other optimizations # for variables that are e.g. only read in the kernel kernel_parameters_lines.append(f"{dtype}* {ptr_array_name}") # Add size variables `_num{array}` only once and if # there are two prefixes, base it on host array # `{array}.size()` if len(prefixes) == 1 or prefix == '': code_object_defs_lines.append( f'const int _num{k} = {dyn_array_name}.size();' ) host_parameters_lines.append(f"_num{k}") kernel_parameters_lines.append(f"const int _num{k}") else: # v is ArrayVariable but not DynamicArrayVariable # Add host and kernel parameters only for device pointers if prefix == 'dev': idx = '' if k.endswith('space'): bare_array_name = self.get_array_name(v) idx = f'[current_idx{bare_array_name}]' host_parameters_lines.append(f"{array_name}{idx}") kernel_parameters_lines.append(f'{dtype}* {ptr_array_name}') # Add size variables `_num{array}` only once if n_prefix == 0: line = f'const int _num{k} = {v.size};' code_object_defs_lines.append(line) kernel_constants_lines.append(line) except TypeError: pass # Constant variables elif isinstance(v, Constant): rendered_value = CPPNodeRenderer().render_expr(repr(v.value)) c_type = c_data_type(v.dtype) line = f'const {c_type} {k} = {rendered_value};' code_object_defs_lines.append(line) kernel_constants_lines.append(line) # This rand stuff got a little messy... we pass a device pointer as kernel variable and have a hash define for rand() -> _ptr_..._rand[] # The device pointer is advanced every clock cycle in rand.cu and reset when the random number buffer is refilled (also in rand.cu) # TODO can we just include this in the k == 'rand' test above? # RAND if codeobj.rng_calls["rand"] >= 1 and codeobj.runs_every_tick: host_parameters_lines.append(f"dev_{codeobj.name}_rand") kernel_parameters_lines.append( f"{c_float_dtype}* _ptr_array_{codeobj.name}_rand" ) # RANDN if codeobj.rng_calls["randn"] >= 1 and codeobj.runs_every_tick: host_parameters_lines.append(f"dev_{codeobj.name}_randn") kernel_parameters_lines.append( f"{c_float_dtype}* _ptr_array_{codeobj.name}_randn" ) # POISSON (with scalar lamda) # Here, we don't use the hash define as for rand/n, instead we pass the # kernel paramter (_ptr...) directly to the _poisson function which returns # the correct element # TODO: We could do the same for rand/n and get rid of the hash define hack for rng_type in codeobj.rng_calls.keys(): if rng_type not in ["rand", "randn"] and codeobj.runs_every_tick: assert rng_type.startswith("poisson") if codeobj.rng_calls[rng_type] >= 1: host_parameters_lines.append( f"dev_{codeobj.name}_{rng_type}" ) kernel_parameters_lines.append( f"{c_int_dtype}* _ptr_array_{codeobj.name}_{rng_type}" ) # Sometimes an array is referred to by to different keys in our # dictionary -- make sure to never add a line twice for line in code_object_defs_lines: if not line in code_object_defs[codeobj.name]: code_object_defs[codeobj.name].append(line) for line in host_parameters_lines: if not line in host_parameters[codeobj.name]: host_parameters[codeobj.name].append(line) for line in kernel_parameters_lines: if not line in kernel_parameters[codeobj.name]: kernel_parameters[codeobj.name].append(line) for line in chain(kernel_constants_lines): if not line in kernel_constants[codeobj.name]: kernel_constants[codeobj.name].append(line) for line in additional_host_code_lines: if not line in additional_host_code[codeobj.name]: additional_host_code[codeobj.name].append(line) # Generate the code objects for codeobj in self.code_objects.values(): ns = codeobj.variables def _replace_constants_and_parameters(code): # HOST_CONSTANTS are equivalent to C++ Standalone's CONSTANTS code = code.replace('%HOST_CONSTANTS%', '\n\t\t'.join(code_object_defs[codeobj.name])) # ADDITIONAL_HOST_CODE is extra code, which needs `_N` code = code.replace('%ADDITIONAL_HOST_CODE%', '\n\t\t'.join(additional_host_code[codeobj.name])) # KERNEL_CONSTANTS are the same for inside device kernels code = code.replace('%KERNEL_CONSTANTS%', '\n\t'.join(kernel_constants[codeobj.name])) # HOST_PARAMETERS are parameters that device kernels are called with from host code code = code.replace('%HOST_PARAMETERS%', ',\n\t\t\t'.join(host_parameters[codeobj.name])) # KERNEL_PARAMETERS are the same names of the same parameters inside the device kernels code = code.replace('%KERNEL_PARAMETERS%', ',\n\t'.join(kernel_parameters[codeobj.name])) code = code.replace('%CODEOBJ_NAME%', codeobj.name) return code # Before/after run code for block in codeobj.before_after_blocks: cu_code = getattr(codeobj.code, block + '_cu_file') cu_code = _replace_constants_and_parameters(cu_code) h_code = getattr(codeobj.code, block + '_h_file') writer.write('code_objects/' + block + '_' + codeobj.name + '.cu', cu_code) writer.write('code_objects/' + block + '_' + codeobj.name + '.h', h_code) # Main code code = codeobj.code.cu_file if len(host_parameters[codeobj.name]) == 0: host_parameters[codeobj.name].append("0") kernel_parameters[codeobj.name].append("int dummy") code = _replace_constants_and_parameters(code) # substitue in generated code double types with float types in # single-precision mode if prefs['core.default_float_dtype'] == np.float32: # cast time differences (double) to float in event-drive updates sub = 't - lastupdate' if sub in code: code = code.replace(sub, f'float({sub})') logger.debug(f"Replaced {sub} with float({sub}) in {codeobj.name}") # replace double-precision floating-point literals with their # single-precision version (e.g. `1.0` -> `1.0f`) code = replace_floating_point_literals(code) logger.debug( f"Replaced floating point literals by single precision version " f"(appending `f`) in {codeobj.name}." ) writer.write('code_objects/'+codeobj.name+'.cu', code) writer.write('code_objects/'+codeobj.name+'.h', codeobj.code.h_file)
[docs] def generate_rand_source(self, writer): # Device side binomial functions and poisson functions with variables lambda use # the curand device api. The curand states (one per thread executed in parallel) # are initialized in rand.cu, where as many curand states are initialized as the # size of the largest codeobject with curand device api calls. needed_number_curand_states = {} for co in (self.codeobjects_with_rng["device_api"]["every_tick"] + self.codeobjects_with_rng["device_api"]["single_tick"]): co_name = co.owner.name if co_name not in needed_number_curand_states: if isinstance(co.owner, Synapses): # this is the pointer to the synapse object's N, which is a # null pointer before synapses are generated and an int ptr # after synapse generation (used to test if synapses # already generated or not) N_ptr = f'_array_{co_name}_N' N_value = N_ptr + '[0]' else: N_ptr = None N_value = co.owner._N needed_number_curand_states[co_name] = (N_ptr, N_value) rand_tmp = self.code_object_class().templater.rand(None, None, codeobjects_with_rng_per_run=self.codeobjects_with_rng["host_api"]["per_run"], all_poisson_lamdas=self.all_poisson_lamdas, needed_number_curand_states=needed_number_curand_states, number_run_calls=len(self.codeobjects_with_rng["host_api"]["per_run"]), profiled=self.enable_profiling, curand_float_type=c_data_type(prefs['core.default_float_dtype'])) writer.write('rand.*', rand_tmp)
[docs] def copy_source_files(self, writer, directory): # Copy the brianlibdirectory brianlib_dir = os.path.join(os.path.split(inspect.getsourcefile(self.code_object_class()))[0], 'brianlib') brianlib_files = copy_directory(brianlib_dir, os.path.join(directory, 'brianlib')) for file in brianlib_files: if file.lower().endswith('.cpp'): writer.source_files.add('brianlib/'+file) if file.lower().endswith('.cu'): writer.source_files.add('brianlib/'+file) elif file.lower().endswith('.h'): writer.header_files.add('brianlib/'+file)
[docs] def generate_network_source(self, writer): maximum_run_time = self._maximum_run_time if maximum_run_time is not None: maximum_run_time = float(maximum_run_time) network_tmp = self.code_object_class().templater.network(None, None, maximum_run_time=maximum_run_time, eventspace_arrays=self.eventspace_arrays, spikegenerator_eventspaces=self.spikegenerator_eventspaces) writer.write('network.*', network_tmp)
[docs] def generate_synapses_classes_source(self, writer): synapses_classes_tmp = self.code_object_class().templater.synapses_classes(None, None) writer.write('synapses_classes.*', synapses_classes_tmp)
[docs] def generate_run_source(self, writer): run_tmp = self.code_object_class().templater.run(None, None, run_funcs=self.runfuncs, code_objects=self.code_objects.values(), user_headers=self.headers, array_specs=self.arrays, clocks=self.clocks) writer.write('run.*', run_tmp)
[docs] def generate_makefile(self, writer, cpp_compiler, cpp_compiler_flags, cpp_linker_flags, debug, disable_asserts): available_gpu_arch_flags = ( '--gpu-architecture', '-arch', '--gpu-code', '-code', '--generate-code', '-gencode' ) nvcc_compiler_flags = prefs.devices.cuda_standalone.cuda_backend.extra_compile_args_nvcc gpu_arch_flags = [] for flag in nvcc_compiler_flags: if flag.startswith(available_gpu_arch_flags): gpu_arch_flags.append(flag) nvcc_compiler_flags.remove(flag) elif flag.startswith(('-w', '--disable-warnings')): # add the flage to linker flags, else linking will give warnings cpp_linker_flags.append(flag) # Make the linker options (meant to be passed to `gcc`) compatible with `nvcc` for i, flag in enumerate(cpp_linker_flags): if flag.startswith('-Wl,'): # -Wl,<option> passes <option> directly to linker # for gcc `-Wl,<option>`, for nvcc `-Xlinker "<option>"` cpp_linker_flags[i] = flag.replace('-Wl,', '-Xlinker ') # Check if compute capability was set manually via preference compute_capability_pref = prefs.devices.cuda_standalone.cuda_backend.compute_capability # If GPU architecture was set via `extra_compile_args_nvcc` and # `compute_capability`, ignore `compute_capability` if gpu_arch_flags and compute_capability_pref is not None: logger.warn( "GPU architecture for compilation was specified via " "`prefs.devices.cuda_standalone.cuda_backend.compute_capability` and " "`prefs.devices.cuda_standalone.cuda_backend.extra_compile_args_nvcc`. " "`prefs.devices.cuda_standalone.cuda_backend.compute_capability` will be ignored. " "To get rid of this warning, set " "`prefs.devices.cuda_standalone.brian_backend.compute_capability` to it's default " "value `None`" ) # Ignore compute capability of chosen GPU and the one manually set via # `compute_capability` preferences. self.compute_capability = None # If GPU architecture was set only via `extra_compile_args_nvcc`, use that elif gpu_arch_flags: # Ignore compute capability of chosen GPU self.compute_capability = None # If GPU architecture was set only via `compute_capability` prefs, use that elif compute_capability_pref is not None: self.compute_capability = compute_capability_pref # If compute_capability wasn't set manually, the one from the chosen GPU is used # (stored in self.compute_capability, see self.generate_main_source()) if self.compute_capability is not None: # check if compute capability is supported if self.compute_capability < self.minimal_compute_capability: raise NotImplementedError( f"Compute capability `{self.compute_capability}` is not supported. " f"Minimal supported compute capability is " f"`{self.minimal_compute_capability}`." ) # If GPU architecture is detected automatically or set via `compute_capability` # prefs, we still need to add it as a compile argument if not gpu_arch_flags: # Turn float (3.5) into string ("35") compute_capability_str = ''.join(str(self.compute_capability).split('.')) gpu_arch_flags.append(f"-arch=sm_{compute_capability_str}") # Log compiled GPU architecture if self.compute_capability is None: logger.info( f"Compiling device code with manually set architecture flags " f"({gpu_arch_flags}). Be aware that the minimal supported compute " f"capability is {self.minimal_compute_capability} " "(we are not checking your compile flags)" ) else: logger.info( f"Compiling device code for compute capability " f"{self.compute_capability} (compiler flags: {gpu_arch_flags})" ) nvcc_path = get_nvcc_path() if cpp_compiler=='msvc': # Check CPPStandaloneDevice.generate_makefile() for how to do things raise RuntimeError("Windows is currently not supported. See https://github.com/brian-team/brian2cuda/issues/225") else: # Generate the makefile if os.name=='nt': rm_cmd = 'del *.o /s\n\tdel main.exe $(DEPS)' else: rm_cmd = 'rm $(OBJS) $(PROGRAM) $(DEPS)' if debug: compiler_debug_flags = '-g -DDEBUG -G -DTHRUST_DEBUG' linker_debug_flags = '-g -G' else: compiler_debug_flags = '' linker_debug_flags = '' if disable_asserts: # NDEBUG precompiler macro disables asserts (both for C++ and CUDA) nvcc_compiler_flags += ['-NDEBUG'] makefile_tmp = self.code_object_class().templater.makefile( None, None, source_files=' '.join(sorted(writer.source_files)), header_files=' '.join(sorted(writer.header_files)), cpp_compiler_flags=' '.join(cpp_compiler_flags), compiler_debug_flags=compiler_debug_flags, linker_debug_flags=linker_debug_flags, cpp_linker_flags=' '.join(cpp_linker_flags), nvcc_compiler_flags=' '.join(nvcc_compiler_flags), gpu_arch_flags=' '.join(gpu_arch_flags), nvcc_path=nvcc_path, rm_cmd=rm_cmd, ) writer.write('makefile', makefile_tmp)
[docs] def build(self, directory='output', compile=True, run=True, debug=False, clean=False, with_output=True, disable_asserts=False, additional_source_files=None, run_args=None, direct_call=True, **kwds): ''' Build the project TODO: more details Parameters ---------- directory : str, optional The output directory to write the project to, any existing files will be overwritten. If the given directory name is ``None``, then a temporary directory will be used (used in the test suite to avoid problems when running several tests in parallel). Defaults to ``'output'``. compile : bool, optional Whether or not to attempt to compile the project. Defaults to ``True``. run : bool, optional Whether or not to attempt to run the built project if it successfully builds. Defaults to ``True``. debug : bool, optional Whether to compile in debug mode. Defaults to ``False``. with_output : bool, optional Whether or not to show the ``stdout`` of the built program when run. Output will be shown in case of compilation or runtime error. Defaults to ``True``. clean : bool, optional Whether or not to clean the project before building. Defaults to ``False``. additional_source_files : list of str, optional A list of additional ``.cu`` files to include in the build. direct_call : bool, optional Whether this function was called directly. Is used internally to distinguish an automatic build due to the ``build_on_run`` option from a manual ``device.build`` call. ''' if self.build_on_run and direct_call: raise RuntimeError('You used set_device with build_on_run=True ' '(the default option), which will automatically ' 'build the simulation at the first encountered ' 'run call - do not call device.build manually ' 'in this case. If you want to call it manually, ' 'e.g. because you have multiple run calls, use ' 'set_device with build_on_run=False.') if self.has_been_run: raise RuntimeError('The network has already been built and run ' 'before. To build several simulations in ' 'the same script, call "device.reinit()" ' 'and "device.activate()". Note that you ' 'will have to set build options (e.g. the ' 'directory) and defaultclock.dt again.') renames = {'project_dir': 'directory', 'compile_project': 'compile', 'run_project': 'run'} if len(kwds): msg = '' for kwd in kwds: if kwd in renames: msg += ("Keyword argument '%s' has been renamed to " "'%s'. ") % (kwd, renames[kwd]) else: msg += f"Unknown keyword argument '{kwd}'. " raise TypeError(msg) if debug and disable_asserts: logger.warn("You have disabled asserts in debug mode. Are you sure this is what you wanted to do?") if additional_source_files is None: additional_source_files = [] if run_args is None: run_args = [] if directory is None: directory = tempfile.mkdtemp(prefix='brian_standalone_') self.project_dir = directory ensure_directory(directory) # Determine compiler flags and directories cpp_compiler, cpp_default_extra_compile_args = get_compiler_and_args() extra_compile_args = self.extra_compile_args + cpp_default_extra_compile_args extra_link_args = self.extra_link_args + prefs['codegen.cpp.extra_link_args'] codeobj_define_macros = [macro for codeobj in self.code_objects.values() for macro in codeobj.compiler_kwds.get('define_macros', [])] define_macros = (self.define_macros + prefs['codegen.cpp.define_macros'] + codeobj_define_macros) codeobj_include_dirs = [include_dir for codeobj in self.code_objects.values() for include_dir in codeobj.compiler_kwds.get('include_dirs', [])] include_dirs = (self.include_dirs + prefs['codegen.cpp.include_dirs'] + codeobj_include_dirs) codeobj_library_dirs = [library_dir for codeobj in self.code_objects.values() for library_dir in codeobj.compiler_kwds.get('library_dirs', [])] library_dirs = (self.library_dirs + prefs['codegen.cpp.library_dirs'] + codeobj_library_dirs) codeobj_runtime_dirs = [runtime_dir for codeobj in self.code_objects.values() for runtime_dir in codeobj.compiler_kwds.get('runtime_library_dirs', [])] runtime_library_dirs = (self.runtime_library_dirs + prefs['codegen.cpp.runtime_library_dirs'] + codeobj_runtime_dirs) codeobj_libraries = [library for codeobj in self.code_objects.values() for library in codeobj.compiler_kwds.get('libraries', [])] libraries = (self.libraries + prefs['codegen.cpp.libraries'] + codeobj_libraries) cpp_compiler_obj = ccompiler.new_compiler(compiler=cpp_compiler) cpp_compiler_flags = (ccompiler.gen_preprocess_options(define_macros, include_dirs) + extra_compile_args) cpp_linker_flags = (ccompiler.gen_lib_options(cpp_compiler_obj, library_dirs=library_dirs, runtime_library_dirs=runtime_library_dirs, libraries=libraries) + extra_link_args) codeobj_source_files = [source_file for codeobj in self.code_objects.values() for source_file in codeobj.compiler_kwds.get('sources', [])] additional_source_files += codeobj_source_files for d in ['code_objects', 'results', 'static_arrays']: ensure_directory(os.path.join(directory, d)) self.writer = CUDAWriter(directory) logger.diagnostic("Writing CUDA standalone project to directory "+os.path.normpath(directory)) self.write_static_arrays(directory) # Check that all names are globally unique names = [obj.name for net in self.networks for obj in net.objects] non_unique_names = [name for name, count in Counter(names).items() if count > 1] if len(non_unique_names): formatted_names = ', '.join(f"'{name}'" for name in non_unique_names) raise ValueError('All objects need to have unique names in ' 'standalone mode, the following name(s) were used ' 'more than once: %s' % formatted_names) # Collect all multisynaptic indices in all Synapses (that have them) self.multisyn_vars = [] for syn in self.synapses: if syn.multisynaptic_index is not None: self.multisyn_vars.append(syn.variables[syn.multisynaptic_index]) # Collect all variables that are stored on host only and should not be copied # from device to host at all (e.g. when set by constant or array or at the end # of the simulation) self.variables_on_host_only = [] for var, varname in self.arrays.items(): try: is_mon = isinstance(var.owner, (StateMonitor, SpikeMonitor, EventMonitor)) except ReferenceError: # some variable ownders are weakreference that don't exist anymore # https://github.com/brian-team/brian2cuda/issues/296#issuecomment-1145085524 continue if is_mon and var.name == 'N': # The size variable of monitors is managed on host via device vectors self.variables_on_host_only.append(varname) if var.name in ('t', 'dt', 'timestep'): # We manage time variables on host and pass them by value to kernels self.variables_on_host_only.append(varname) for var, varname in self.dynamic_arrays.items(): varnames = ['_synaptic_pre', '_synaptic_post'] try: # TODO: Manage Spike- & EventMonitor t also on host (modify template) is_monitor_t = isinstance(var.owner, StateMonitor) and var.name == 't' except ReferenceError: continue if var in self.multisyn_vars or var.name in varnames or is_monitor_t: self.variables_on_host_only.append(varname) if var.name == 'delay': # By default, delete all delay variables after # before_run_synapses_push_spikes, but only those that weren't already # set to be deleted during variable set calls (in # self.variableview_set_with_index_array and self.fill_with_array) if varname not in self.delete_synaptic_delay: self.delete_synaptic_delay[varname] = True # This avoids copying the delted delay array from device to host # at the end of the simulation self.variables_on_host_only.append(varname) self.generate_main_source(self.writer) # Create lists of codobjects using rand, randn, poisson or binomial across all # runs (needed for variable declarations). # - Variables needed for device side rand/randn/poisson are declared in objects.cu: # codeobjects_with_rng["host_api"]["all_runs"]['rand'/'rand'/'poisson'] are needed in `generate_objects_source` # - Variables needed for device side binomial functions are initialized in rand.cu: # codeobjects_with_rng["device_api"]["every_tick"] is needed in `generate_rand_source` for run_codeobj in self.codeobjects_with_rng["host_api"]["per_run"]: for key in run_codeobj.keys(): # keys: 'rand', 'randn', 'poisson-<idx>' self.codeobjects_with_rng["host_api"]["all_runs"][key].extend(run_codeobj[key]) self.generate_codeobj_source(self.writer) self.generate_objects_source(self.writer, self.arange_arrays, self.synapses, self.static_array_specs, self.networks) self.generate_network_source(self.writer) self.generate_synapses_classes_source(self.writer) self.generate_run_source(self.writer) self.generate_rand_source(self.writer) self.copy_source_files(self.writer, directory) self.writer.source_files.update(additional_source_files) self.generate_makefile(self.writer, cpp_compiler, cpp_compiler_flags, cpp_linker_flags, debug, disable_asserts) # Not sure what the best place is to call Network.after_run -- at the # moment the only important thing it does is to clear the objects stored # in magic_network. If this is not done, this might lead to problems # for repeated runs of standalone (e.g. in the test suite). for net in self.networks: net.after_run() logger.info("Using the following preferences for CUDA standalone:") for pref_name in prefs: if "devices.cuda_standalone" in pref_name: logger.info(f"\t{pref_name} = {prefs[pref_name]}") logger.debug("Using the following brian preferences:") for pref_name in prefs: if "devices.cuda_standalone" not in pref_name: logger.debug(f"\t{pref_name} = {prefs[pref_name]}") if compile: self.compile_source(directory, cpp_compiler, debug, clean) if run: self.run(directory, with_output, run_args)
[docs] def network_run(self, net, duration, report=None, report_period=10*second, namespace=None, profile=False, level=0, **kwds): ################################################### ### This part is copied from CPPStandaoneDevice ### ################################################### self.networks.add(net) if kwds: logger.warn(('Unsupported keyword argument(s) provided for run: ' '%s') % ', '.join(kwds.keys())) # We store this as an instance variable for later access by the # `code_object` method self.enable_profiling = profile # Allow setting `profile` in the `set_device` call (used e.g. in brian2cuda # SpeedTest configurations) if 'profile' in self.build_options: build_profile = self.build_options.pop('profile') if build_profile: self.enable_profiling = True all_objects = net.sorted_objects net._clocks = {obj.clock for obj in all_objects} t_end = net.t+duration for clock in net._clocks: clock.set_interval(net.t, t_end) # Get the local namespace if namespace is None: namespace = get_local_namespace(level=level+2) net.before_run(namespace) self.synapses |= {s for s in net.objects if isinstance(s, Synapses)} self.clocks.update(net._clocks) net.t_ = float(t_end) # TODO: remove this horrible hack for clock in self.clocks: if clock.name=='clock': clock._name = '_clock' # Extract all the CodeObjects # Note that since we ran the Network object, these CodeObjects will be sorted into the right # running order, assuming that there is only one clock code_objects = [] for obj in all_objects: if obj.active: for codeobj in obj._code_objects: code_objects.append((obj.clock, codeobj)) # Code for a progress reporting function standard_code = ''' std::string _format_time(float time_in_s) { float divisors[] = {24*60*60, 60*60, 60, 1}; char letters[] = {'d', 'h', 'm', 's'}; float remaining = time_in_s; std::string text = ""; int time_to_represent; for (int i =0; i < sizeof(divisors)/sizeof(float); i++) { time_to_represent = int(remaining / divisors[i]); remaining -= time_to_represent * divisors[i]; if (time_to_represent > 0 || text.length()) { if(text.length() > 0) { text += " "; } text += (std::to_string(time_to_represent)+letters[i]); } } //less than one second if(text.length() == 0) { text = "< 1s"; } return text; } void report_progress(const double elapsed, const double completed, const double start, const double duration) { if (completed == 0.0) { %STREAMNAME% << "Starting simulation at t=" << start << " s for duration " << duration << " s"; } else { %STREAMNAME% << completed*duration << " s (" << (int)(completed*100.) << "%) simulated in " << _format_time(elapsed); if (completed < 1.0) { const int remaining = (int)((1-completed)/completed*elapsed+0.5); %STREAMNAME% << ", estimated " << _format_time(remaining) << " remaining."; } } %STREAMNAME% << std::endl << std::flush; } ''' if report is None: report_func = '' elif report == 'text' or report == 'stdout': report_func = standard_code.replace('%STREAMNAME%', 'std::cout') elif report == 'stderr': report_func = standard_code.replace('%STREAMNAME%', 'std::cerr') elif isinstance(report, str): report_func = ''' void report_progress(const double elapsed, const double completed, const double start, const double duration) { %REPORT% } '''.replace('%REPORT%', report) else: raise TypeError(('report argument has to be either "text", ' '"stdout", "stderr", or the code for a report ' 'function')) if report_func != '': if self.report_func != '' and report_func != self.report_func: raise NotImplementedError('The CUDA standalone device does not ' 'support multiple report functions, ' 'each run has to use the same (or ' 'none).') self.report_func = report_func if report is not None: report_call = 'report_progress' else: report_call = 'NULL' ############################################################## ### From here on the code differs from CPPStandaloneDevice ### ############################################################## # For each codeobject of this run check if it uses rand, randn, poisson or # binomials. Store these as attributes of the codeobject and create # lists of codeobjects that use rand, randn, poisson or binomials. This only # checks codeobject in the network, meaning only the ones running every # clock cycle. # self.codeobjects_with_rng["host_api"]["per_run"] is a list (one per run) of defaultdicts # with keys 'rand', 'randn', 'poisson_<idx>' and values being lists of # codeobjects. self.codeobjects_with_rng["host_api"]["per_run"].append(defaultdict(list)) run_idx = -1 # last list index # Count random number ocurrences in codeobjects run every tick for _, co in code_objects: # (clock, code_object) prepare_codeobj_code_for_rng(co) if co.rng_calls["rand"] > 0: self.codeobjects_with_rng["host_api"]["per_run"][run_idx]['rand'].append(co) if co.rng_calls["randn"] > 0: self.codeobjects_with_rng["host_api"]["per_run"][run_idx]['randn'].append(co) for poisson_name, lamda in co.poisson_lamdas.items(): self.all_poisson_lamdas[co.name][poisson_name] = lamda self.codeobjects_with_rng["host_api"]["per_run"][run_idx][poisson_name].append(co) if co.needs_curand_states: if co not in self.codeobjects_with_rng["device_api"]["every_tick"]: self.codeobjects_with_rng["device_api"]["every_tick"].append(co) # To profile SpeedTests, we need to be able to set `profile` in # `set_device`. Here we catch that case. if 'profile' in self.build_options: build_profile = self.build_options.pop('profile') if build_profile: self.enable_profiling = True # Generate the updaters run_lines = [] run_lines.append(f'{net.name}.clear();') # create all random numbers needed for the next clock cycle for clock in net._clocks: run_lines.append(f'{net.name}.add(&{clock.name}, _run_random_number_buffer);') all_clocks = set() for clock, codeobj in code_objects: run_lines.append(f'{net.name}.add(&{clock.name}, _run_{codeobj.name});') all_clocks.add(clock) # Under some rare circumstances (e.g. a NeuronGroup only defining a # subexpression that is used by other groups (via linking, or recorded # by a StateMonitor) *and* not calculating anything itself *and* using a # different clock than all other objects) a clock that is not used by # any code object should nevertheless advance during the run. We include # such clocks without a code function in the network. for clock in net._clocks: if clock not in all_clocks: run_lines.append(f'{net.name}.add(&{clock.name}, NULL);') run_lines.extend(self.code_lines['before_network_run']) # run everything that is run on a clock run_lines.append( f'{net.name}.run({float(duration)!r}, {report_call}, {float(report_period)!r});' ) run_lines.extend(self.code_lines['after_network_run']) # for multiple runs, the random number buffer needs to be reset run_lines.append('random_number_buffer.run_finished();') # nvprof stuff run_lines.append('CUDA_SAFE_CALL(cudaDeviceSynchronize());') run_lines.append('CUDA_SAFE_CALL(cudaProfilerStop());') self.main_queue.append(('run_network', (net, run_lines))) net.after_run() # Manually set the cache for the clocks, simulation scripts might # want to access the time (which has been set in code and is therefore # not accessible by the normal means until the code has been built and # run) for clock in net._clocks: self.array_cache[clock.variables['timestep']] = np.array([clock._i_end]) self.array_cache[clock.variables['t']] = np.array([clock._i_end * clock.dt_]) # Initialize eventspaces with -1 before the network runs for codeobj in self.code_objects.values(): if codeobj.template_name == "threshold" or codeobj.template_name == "spikegenerator": for key in codeobj.variables.keys(): if key.endswith('space'): # get the correct eventspace name eventspace_name = self.get_array_name(codeobj.variables[key], False) # In case of custom scheduling, the thresholder might come after synapses or monitors # and needs to be initialized in the beginning of the simulation # See generate_main_source() for main_queue formats # Initialize entire eventspace array with -1 at beginning of main self.main_queue.insert( 0, # list insert position # func , (arrayname, value, is_dynamic) ('set_by_constant', (eventspace_name, -1, False)) ) # Set the last value (index N) in the eventspace array to 0 (-> event counter) self.main_queue.insert( 1, # list insert position ( 'set_by_single_value', # func # arrayname , item, , value (eventspace_name, f"_num_{eventspace_name} - 1", 0) ) ) if self.build_on_run: if self.has_been_run: raise RuntimeError('The network has already been built and run ' 'before. Use set_device with ' 'build_on_run=False and an explicit ' 'device.build call to use multiple run ' 'statements with this device.') self.build(direct_call=False, **self.build_options) self.first_run = False
[docs] def fill_with_array(self, var, *args, **kwargs): # If the delay variable is set after the first run call, do not delete it on the # device (which is happening by default) if not self.first_run and var.name == 'delay': synaptic_delay_array_name = self.get_array_name(var, access_data=False) self.delete_synaptic_delay[synaptic_delay_array_name] = False super().fill_with_array(var, *args, **kwargs)
[docs] def variableview_set_with_index_array(self, variableview, *args, **kwargs): # If the delay variable is set after the first run call, do not delete it on the # device (which is happening by default) if not self.first_run and variableview.name == 'delay': synaptic_delay_array_name = self.get_array_name(variableview.variable, access_data=False) self.delete_synaptic_delay[synaptic_delay_array_name] = False super().variableview_set_with_index_array(variableview, *args, **kwargs)
[docs] def network_store(self, net, *args, **kwds): raise NotImplementedError(('The store/restore mechanism is not ' 'supported in CUDA standalone'))
[docs] def network_restore(self, net, *args, **kwds): raise NotImplementedError(('The store/restore mechanism is not ' 'supported in CUDA standalone'))
[docs]def prepare_codeobj_code_for_rng(codeobj): ''' Prepare a CodeObject for random number generation (RNG). There are two different ways that random numbers are generated in CUDA: 1) Using a buffer system which is refilled from host code in regular intervals using the cuRAND host API. This is used for `rand()`, `randn()` and `poisson(lambda)` when `lambda` is a scalar. The buffer system is implemented in the `rand.cu` template. 2) Using on-the-fly RNG from device code using the cuRAND device API. This is used for `binomial` and `poisson(lambda)` when `lambda` is a vectorized variable (different across neurons/synapses). This needs initilization of cuRAND random states, which is also happening in the `rand.cu` template. This function counts the number of `rand()`, `randn()` and `poisson(<lambda>)` appearances in `codeobj.code.cu_file` and stores this number in the `codeobj.rng_calls` dictionary (with keys `"rand"`, `"randn"` and `"poisson_<idx>"` ,one <idx> per `poisson()` call). If the codeobject uses the curand device API for RNG (for binomial of poisson with variable lambda), this function sets `codeobj.needs_curand_states = True`. For RNG functions that use the buffer system, this function replaces the function arguments in the generated code such that a pointer to the random number buffer and the correct index are passed as function arguments. For RNG functions that use on-the-fly RNG, the functions are not replaced since no pointer or index has to be passed. For the `poisson` RNG, the RNG type depends on the `lambda` value. For scalar `lambda`, we use the buffer system which is most efficient and most robust in the RNG. For vectorized `lambda` values, the host API is inefficient and instead the simple device API is used, which is the most efficient but least robust. For the two RNG systems to work, we overload the CUDA implementation of `_poisson` with `_poisson(double _lambda, ...)` and `_poisson(unsigned int* _poisson_buffer, ...)`. When the buffer system is used, we replace the `_poisson(<lambda>, ...)` calls with `_poisson(<int_pointer>, ...)` calls. For `poisson` with `lambda <= 0`, the returned random numbers are always `0`. This function makes sure that the `lambda` is replaced with a double literal for our overloaded `_poisson` function to work correctly. Parameters ---------- codeobj: CodeObjects Codeobject with generated CUDA code in `codeobj.code.cu_file`. ''' # synapses_create_generator uses host side random number generation if codeobj.template_name == 'synapses_create_generator': return ### RAND/N REGEX # regex explained # (?<!...) negative lookbehind: don't match if ... preceeds # XXX: This only excludes #define lines which have exactly one space to '_rand'. # As long is we don't have '#define _rand' with 2 spaces, thats fine. # \b - non-alphanumeric character (word boundary, does not consume a character) pattern_template = r'(?<!#define )\b_{rng_func}\(_vectorisation_idx\)' rand_randn_pattern = { 'rand': pattern_template.format(rng_func='rand'), 'randn': pattern_template.format(rng_func='randn'), } # Store number of matches in codeobj.rng_calls dictionary for rng_type in ['rand', 'randn']: matches = re.findall(rand_randn_pattern[rng_type], codeobj.code.cu_file) num_calls = len(matches) codeobj.rng_calls[rng_type] = num_calls logger.diagnostic( f"Matched {num_calls} {rng_type} calls for {codeobj.name}" ) ### POISSON REGEX # (?P<lambda>*?) Named group: Returns whatever is matched inside the brackets # instead of the entire string and stores it in the variable # `lamda`. # `.*?` does a non-greedy match of all (*). This is the lambda value. poisson_pattern = r'(?<!#define )\b_poisson\((?P<lamda>.*?), _vectorisation_idx\)' matches_poisson = re.findall(poisson_pattern, codeobj.code.cu_file) # Collect the number of poisson calls separated by lambda values (we need to # generate poisson values separately for each lambda value when using cuRAND host # API). We call the poisson functions with different lambda `poisson-<idx>`, where # `<idx>` is enuemerates all poisson functions. It looks like this: # # codeobj.rng_calls.keys() = ["poisson_0", "poisson_1", ...] # codeobj.rng_calls["poisson_0"] = <number_of_calls_per_time_step> # # The lamda values for all poisson functions are stored in # device.all_poisson_lamdas[codeobj.name]["poisson_0"] = <lambda_value> lamda_matches = {} poisson_device_api = False poisson_with_lamda_zero = [] for i, lamda_match in enumerate(sorted(set(matches_poisson))): poisson_name = f"poisson_{i}" # Test if the lambda_match from poisson(<lambda_match>) is scalar or vectorized # (different across neurons/synapses of the codeobject owner) try: # Try to convert it to float, will raise ValueError if not possible # This will work only if `lambda_match` is a literal, e.g. in `poisson(5)` lamda = float(lamda_match) lamda_is_scalar = True logger.debug( f"Matched literal scalar lambda {lamda} for {poisson_name} in {codeobj.name}" ) except ValueError: # lamda is not a float but a variable, e.g. `poisson(var)` # Check if lamda is scalar and constant (i.e. doesn't change during a run) # TODO: check if scalar variable can be set during in run. If so, check for # constant here as well! # TODO: make sure that a scalar and constand variable can't be changed # during a run! if lamda_match in codeobj.variables and codeobj.variables[lamda_match].scalar: lamda = codeobj.variables[lamda_match].value const = codeobj.variables[lamda_match].constant lamda_is_scalar = True logger.debug( f"Matched non-literal scalar lambda {lamda} for {poisson_name} in {codeobj.name}" ) else: # lamda is an array variable lamda = lamda_match lamda_is_scalar = False logger.debug( f"Matched vectorized lambda {lamda} in {codeobj.name}" ) if lamda_is_scalar: # We don't want to generate random numbers on the host for lambda <= 0, # which should return 0 anyways. We use the _poisson(double, ...) function, # which checks for lambda <= 0 before calling the curand devica API. Hence # we don't need host side RNG or curand states. lamda_matches[poisson_name] = lamda_match if lamda <= 0: # We need to replace '0' with '0.0' (double literal) for lambda in this # case, see comment below poisson_with_lamda_zero.append(poisson_name) continue assert lamda not in codeobj.poisson_lamdas.values() assert poisson_name not in codeobj.poisson_lamdas.keys() codeobj.poisson_lamdas[poisson_name] = lamda codeobj.rng_calls[poisson_name] = matches_poisson.count(lamda_match) else: codeobj.needs_curand_states = True poisson_device_api = True # We have two if/else code paths in synapses code (homog. / heterog. delay mode), # therefore we have twice as much matches for rand/randn/poisson-<idx> as actual # calls. Hence we half the number of detected calls here. if codeobj.template_name == 'synapses': for rng_type in codeobj.rng_calls.keys(): assert codeobj.rng_calls[rng_type] % 2 == 0 codeobj.rng_calls[rng_type] //= 2 # RAND/N # Substitue the _vectorisation_idx of _rand/n calls such that different calls always # get different random numbers from the random number buffers # Substitute rand/n arguments twice for synapses templates repeat = 2 if codeobj.template_name == 'synapses' else 1 for rng_type in ["rand", "randn"]: if codeobj.rng_calls[rng_type] > 0: for _ in range(repeat): for i in range(codeobj.rng_calls[rng_type]): codeobj.code.cu_file = re.sub( rand_randn_pattern[rng_type], f"_{rng_type}(_vectorisation_idx + {i} * _N)", codeobj.code.cu_file, count=1 ) # POISSON sub_repl_template = ( "_poisson(_ptr_array_{codeobj.name}_{poisson_name}, _vectorisation_idx + {i} * _N)" ) for poisson_name, lamda_match in lamda_matches.items(): sub_pattern = poisson_pattern.replace( # use the correct lamda instead of matching the lamda "(?P<lamda>.*?)", lamda_match ) if codeobj.rng_calls[poisson_name] > 0: for _ in range(repeat): for i in range(codeobj.rng_calls[poisson_name]): sub_repl = sub_repl_template.format( codeobj=codeobj, poisson_name=poisson_name, i=i ) codeobj.code.cu_file = re.sub( sub_pattern, sub_repl, codeobj.code.cu_file, count=1 ) elif poisson_name in poisson_with_lamda_zero: # Make sure the _poisson argument is a double literal. "0" fails since # it can be interpreted as both, null pointer or double and the _poisson # implementation is overloaded for `unsigned int *` and `double`. sub_repl= f"_poisson({float(lamda_match):.1f}, _vectorisation_idx)" codeobj.code.cu_file = re.sub( sub_pattern, sub_repl, codeobj.code.cu_file, count=0 # replace all ocurrences at once ) # If the codeobjec does not need curand states for poisson, check if it needs # them for binomial calls if not codeobj.needs_curand_states: match = re.search('_binomial\w*\(const int vectorisation_idx\)', codeobj.code.cu_file) if match is not None: codeobj.needs_curand_states = True
cuda_standalone_device = CUDAStandaloneDevice() all_devices['cuda_standalone'] = cuda_standalone_device