Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add CUDA wrapper capability. #714

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
110 changes: 110 additions & 0 deletions demo/nvrtc_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#include "Components.h"
#include "FacetIntegrals.h"
#include "HyperElasticity.h"
#include "MathFunctions.h"
#include "StabilisedStokes.h"
#include "VectorPoisson.h"
#include "ufcx.h"
#include "nvrtc.h"
#include <iostream>
#include <stdexcept>
#include <sstream>
#include <string>
#include <vector>

/// Check that the NVRTC CUDA wrapper of a form's first integral produces
/// source code that NVRTC can compile.
///
/// @param[in] form  Form whose first integral's CUDA wrapper is checked.
/// @throws std::runtime_error if the wrapper is missing or any NVRTC call
///         fails; on compile failure the message includes the generated
///         source and the NVRTC compiler log.
void check_nvrtc_compilation(ufcx_form* form)
{
  // Extract the NVRTC wrapper from the form's first integral. The wrapper is
  // a host-side function that hands back CUDA C++ source code for the
  // tabulate_tensor kernel, intended for runtime compilation with NVRTC.
  ufcx_integral* integral = form->form_integrals[0];
  ufcx_tabulate_tensor_cuda_nvrtc* kernel = integral->tabulate_tensor_cuda_nvrtc;
  if (!kernel) {
    throw std::runtime_error("NVRTC wrapper function is NULL!");
  }

  // Call the wrapper to obtain the CUDA-wrapped source code, its headers and
  // the name of the device-side entry point.
  int num_program_headers;
  const char** program_headers;
  const char** program_include_names;
  const char* program_src;
  const char* tabulate_tensor_function_name;
  (*kernel)(
      &num_program_headers, &program_headers,
      &program_include_names, &program_src,
      &tabulate_tensor_function_name);

  // Compile the CUDA-wrapped source code with NVRTC, checking for errors at
  // every step.
  nvrtcResult nvrtc_err;
  nvrtcProgram program;
  nvrtc_err = nvrtcCreateProgram(
      &program, program_src, tabulate_tensor_function_name,
      num_program_headers, program_headers,
      program_include_names);
  if (nvrtc_err != NVRTC_SUCCESS) {
    throw std::runtime_error(
        "nvrtcCreateProgram() failed with " +
        std::string(nvrtcGetErrorString(nvrtc_err)) + " "
        "at " + std::string(__FILE__) + ":" + std::to_string(__LINE__));
  }

  // Compile with no extra options: NVRTC accepts a null options array when
  // the option count is zero. (Passing an uninitialized pointer here, as the
  // previous revision did, is undefined behavior.)
  nvrtcResult nvrtc_compile_err = nvrtcCompileProgram(program, 0, nullptr);
  if (nvrtc_compile_err != NVRTC_SUCCESS) {
    // The compiler failed; try to obtain the compiler log so it can be
    // included in the thrown error message.
    std::string program_log;
    size_t log_size;
    nvrtc_err = nvrtcGetProgramLogSize(program, &log_size);
    if (nvrtc_err != NVRTC_SUCCESS) {
      program_log =
          "nvrtcGetProgramLogSize() failed with " +
          std::string(nvrtcGetErrorString(nvrtc_err)) + " "
          "at " + std::string(__FILE__) + ":" + std::to_string(__LINE__);
    } else {
      program_log.resize(log_size);
      // &program_log[0] yields writable contiguous storage (guaranteed since
      // C++11), avoiding the const_cast of c_str() used previously.
      nvrtc_err = nvrtcGetProgramLog(program, &program_log[0]);
      if (nvrtc_err != NVRTC_SUCCESS) {
        program_log =
            "nvrtcGetProgramLog() failed with " +
            std::string(nvrtcGetErrorString(nvrtc_err)) + " "
            "at " + std::string(__FILE__) + ":" + std::to_string(__LINE__);
      } else if (log_size > 0) {
        // Drop the trailing NUL terminator counted in the reported log size.
        // Only truncate when the log was actually retrieved, so an error
        // message stored above is not mangled.
        program_log.resize(log_size - 1);
      }
    }
    nvrtcDestroyProgram(&program);

    std::stringstream ss;
    ss << "nvrtcCompileProgram() failed with "
       << nvrtcGetErrorString(nvrtc_compile_err) << "\n"
       << "CUDA C++ source code:\n"
       << std::string(60, '-') << "\n"
       << program_src
       << std::string(60, '-') << "\n"
       << "NVRTC compiler log:\n"
       << std::string(60, '-') << "\n"
       << program_log << "\n"
       << std::string(60, '-') << "\n";
    throw std::runtime_error(ss.str());
  }
}

int main()
{
std::vector<ufcx_form*> forms = {
form_Components_L,
form_FacetIntegrals_a,
form_HyperElasticity_a_F, form_HyperElasticity_a_J,
form_MathFunctions_a,
form_StabilisedStokes_a, form_StabilisedStokes_L,
form_VectorPoisson_a, form_VectorPoisson_L
};

for (ufcx_form* form : forms) check_nvrtc_compilation(form);

return 0;
}

50 changes: 50 additions & 0 deletions demo/test_demos.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,53 @@ def test_demo(file, scalar_type):
os.system(f"cd {demo_dir} && {cc} -I../ffcx/codegeneration {extra_flags} -c {file}.c")
== 0
)


@pytest.mark.parametrize("scalar_type", ["float64", "float32"])
def test_demo_nvrtc(scalar_type):
    """Test generated CUDA code with NVRTC.

    Generates code for each demo form with ``--cuda_nvrtc``, compiles the C
    output with the host C compiler, then builds and runs ``nvrtc_test.cpp``,
    which feeds each generated kernel through NVRTC.
    """
    # Skip when the NVRTC pip package is unavailable. The package currently
    # only installs on Linux; requiring a full CUDA Toolkit install on other
    # platforms is out of scope for this test.
    try:
        from nvidia import cuda_nvrtc
    except ImportError:
        pytest.skip(reason="Must have NVRTC pip package installed to run test.")

    if sys.platform.startswith("win32"):
        pytest.skip(reason="NVRTC CUDA wrappers not currently supported for Windows.")

    files = [
        "Components",
        "FacetIntegrals",
        "HyperElasticity",
        "MathFunctions",
        "StabilisedStokes",
        "VectorPoisson",
    ]
    opts = f"--scalar_type {scalar_type} --cuda_nvrtc"
    nvrtc_dir = os.path.dirname(os.path.realpath(cuda_nvrtc.__file__))
    cc = os.environ.get("CC", "cc")
    extra_flags = (
        "-std=c17 -Wunused-variable -Werror -fPIC -Wno-error=implicit-function-declaration"
    )
    # Generate and compile the C code for each demo form.
    for file in files:
        assert os.system(f"cd {demo_dir} && ffcx {opts} {file}.py") == 0
        assert (
            os.system(
                f"cd {demo_dir} && "
                f"{cc} -I../ffcx/codegeneration "
                f"{extra_flags} "
                f"-c {file}.c"
            )
            == 0
        )

    # Link the NVRTC test driver against the generated objects and run it.
    # NOTE(review): '-l:libnvrtc.so.12' is GNU-ld specific, which is fine on
    # Linux -- the only platform this test currently runs on.
    cxx = os.environ.get("CXX", "c++")
    assert (
        os.system(
            f"cd {demo_dir} && "
            f"{cxx} -I../ffcx/codegeneration -I{nvrtc_dir}/include -L{nvrtc_dir}/lib "
            f" -Werror -o nvrtc_test nvrtc_test.cpp "
            f"{' '.join([file+'.o' for file in files])} -l:libnvrtc.so.12"
        )
        == 0
    )
    assert os.system(f"LD_LIBRARY_PATH=$LD_LIBRARY_PATH:{nvrtc_dir}/lib {demo_dir}/nvrtc_test") == 0
13 changes: 12 additions & 1 deletion ffcx/codegeneration/C/integrals.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,17 +69,27 @@ def generator(ir: IntegralIR, options):
else:
code["tabulate_tensor_complex64"] = ".tabulate_tensor_complex64 = NULL,"
code["tabulate_tensor_complex128"] = ".tabulate_tensor_complex128 = NULL,"
if options.get("cuda_nvrtc"):
code["tabulate_tensor_cuda_nvrtc"] = (
f".tabulate_tensor_cuda_nvrtc = tabulate_tensor_cuda_nvrtc_{factory_name},"
)
code["tabulate_tensor_quoted"] = body.replace("\n", '\\n"\n "')
else:
code["tabulate_tensor_cuda_nvrtc"] = ""
code["tabulate_tensor_quoted"] = ""

np_scalar_type = np.dtype(options["scalar_type"]).name
code[f"tabulate_tensor_{np_scalar_type}"] = (
f".tabulate_tensor_{np_scalar_type} = tabulate_tensor_{factory_name},"
)

assert ir.expression.coordinate_element_hash is not None
implementation = ufcx_integrals.factory.format(
implementation = ufcx_integrals.get_factory(options).format(
factory_name=factory_name,
enabled_coefficients=code["enabled_coefficients"],
enabled_coefficients_init=code["enabled_coefficients_init"],
tabulate_tensor=code["tabulate_tensor"],
tabulate_tensor_quoted=code["tabulate_tensor_quoted"],
needs_facet_permutations="true" if ir.expression.needs_facet_permutations else "false",
scalar_type=dtype_to_c_type(options["scalar_type"]),
geom_type=dtype_to_c_type(dtype_to_scalar_dtype(options["scalar_type"])),
Expand All @@ -88,6 +98,7 @@ def generator(ir: IntegralIR, options):
tabulate_tensor_float64=code["tabulate_tensor_float64"],
tabulate_tensor_complex64=code["tabulate_tensor_complex64"],
tabulate_tensor_complex128=code["tabulate_tensor_complex128"],
tabulate_tensor_cuda_nvrtc=code["tabulate_tensor_cuda_nvrtc"],
)

return declaration, implementation
51 changes: 51 additions & 0 deletions ffcx/codegeneration/C/integrals_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,60 @@
{tabulate_tensor_float64}
{tabulate_tensor_complex64}
{tabulate_tensor_complex128}
{tabulate_tensor_cuda_nvrtc}
.needs_facet_permutations = {needs_facet_permutations},
.coordinate_element_hash = {coordinate_element_hash},
}};

// End of code for integral {factory_name}
"""

# C++ template (filled in via str.format) for the host-side NVRTC wrapper of
# a single integral. The generated function returns, through out-parameters,
# the CUDA C++ source of the tabulate_tensor kernel so that callers can
# compile it at runtime with NVRTC. The quoted kernel body is substituted in
# as {tabulate_tensor_quoted}; all pointers written by the generated function
# reference string literals with static storage duration.
# NOTE(review): ufc_scalar_t is typedef'd to double unconditionally even
# though the kernel signature uses {scalar_type} -- confirm this is correct
# for float32 builds.
cuda_wrapper = """

// Begin NVRTC CUDA wrapper for integral {factory_name}
// The wrapper is compiled with a standard C++ compiler, and is called at runtime to generate
// source code which is then compiled into a CUDA kernel at runtime via NVRTC.
void tabulate_tensor_cuda_nvrtc_{factory_name}(int* num_program_headers,
const char*** program_headers,
const char*** program_include_names,
const char** out_program_src,
const char** tabulate_tensor_function_name)
{{
// The below typedefs are needed due to issues with including stdint.h in NVRTC source code
const char* program_src = ""
"#define alignas(x)\\n"
"#define restrict __restrict__\\n"
"\\n"
"typedef unsigned char uint8_t;\\n"
"typedef unsigned int uint32_t;\\n"
"typedef double ufc_scalar_t;\\n"
"\\n"
"extern \\"C\\" __global__\\n"
"void tabulate_tensor_{factory_name}({scalar_type}* restrict A,\\n"
" const {scalar_type}* restrict w,\\n"
" const {scalar_type}* restrict c,\\n"
" const {geom_type}* restrict coordinate_dofs,\\n"
" const int* restrict entity_local_index,\\n"
" const uint8_t* restrict quadrature_permutation\\n"
" )\\n"
"{{\\n"
"{tabulate_tensor_quoted}\\n"
"}}";
*num_program_headers = 0;
*program_headers = NULL;
*program_include_names = NULL;
*out_program_src = program_src;
*tabulate_tensor_function_name = "tabulate_tensor_{factory_name}";
}}

// End NVRTC CUDA wrapper for integral {factory_name}

"""


def get_factory(options):
    """Return the template string for constructing form integrals.

    When the ``cuda_nvrtc`` option is enabled, the NVRTC CUDA wrapper
    template is prepended to the standard integral factory template.
    """
    wants_nvrtc = options.get("cuda_nvrtc")
    return (cuda_wrapper + factory) if wants_nvrtc else factory
3 changes: 3 additions & 0 deletions ffcx/codegeneration/jit.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,9 @@
UFC_INTEGRAL_DECL += "\n".join(
re.findall(r"typedef void ?\(ufcx_tabulate_tensor_complex128\).*?\);", ufcx_h, re.DOTALL)
)
# Extract the NVRTC tabulate_tensor typedef from ufcx.h so JIT-compiled
# modules can reference it (mirrors the complex64/complex128 extraction above).
UFC_INTEGRAL_DECL += "\n".join(
    re.findall(r"typedef void ?\(ufcx_tabulate_tensor_cuda_nvrtc\).*?\);", ufcx_h, re.DOTALL)
)

UFC_INTEGRAL_DECL += "\n".join(
re.findall("typedef struct ufcx_integral.*?ufcx_integral;", ufcx_h, re.DOTALL)
Expand Down
23 changes: 23 additions & 0 deletions ffcx/codegeneration/ufcx.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,28 @@ extern "C"
const uint8_t* restrict quadrature_permutation);
#endif // __STDC_NO_COMPLEX__

/// Return CUDA C++ source code for the ufc_tabulate_tensor kernel.
/// The resulting source code is passed to NVRTC for runtime compilation.
/// NOTE(review): implementations generated by FFCx write pointers to
/// statically allocated strings; confirm callers need not free them.
///
/// @param[out] num_program_headers
///    The number of headers required by the program
/// @param[out] program_headers
///    Entire contents of each header file
/// @param[out] program_include_names
///    Names of each header file
/// @param[out] program_src
///    CUDA C++ source code for the program containing the
///    tabulate_tensor function.
/// @param[out] tabulate_tensor_function_name
///    The name of the device-side function.
///
typedef void(ufcx_tabulate_tensor_cuda_nvrtc)(
int* num_program_headers,
const char*** program_headers,
const char*** program_include_names,
const char** program_src,
const char** tabulate_tensor_function_name);

typedef struct ufcx_integral
{
const bool* enabled_coefficients;
Expand All @@ -135,6 +157,7 @@ extern "C"
ufcx_tabulate_tensor_complex64* tabulate_tensor_complex64;
ufcx_tabulate_tensor_complex128* tabulate_tensor_complex128;
#endif // __STDC_NO_COMPLEX__
ufcx_tabulate_tensor_cuda_nvrtc* tabulate_tensor_cuda_nvrtc;
bool needs_facet_permutations;

/// Hash of the coordinate element associated with the geometry of the mesh.
Expand Down
6 changes: 6 additions & 0 deletions ffcx/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@
logger = logging.getLogger("ffcx")

FFCX_DEFAULT_OPTIONS = {
"cuda_nvrtc": (
bool,
False,
"generate CUDA wrapped versions of tabulate tensor functions for use with NVRTC",
None,
),
"epsilon": (float, 1e-14, "machine precision, used for dropping zero terms in tables.", None),
"scalar_type": (
str,
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ ffcx = "ffcx:__main__.main"
[project.optional-dependencies]
lint = ["ruff"]
docs = ["sphinx", "sphinx_rtd_theme"]
optional = ["numba", "pygraphviz==1.7"]
optional = ["numba", "pygraphviz==1.7", "nvidia-cuda-nvrtc-cu12"]
test = ["pytest >= 6.0", "sympy", "numba"]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also add Nvidia rtc here - the full set of tests should be able to execute installing this optional set.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the current CI, the optional dependencies are installed only on Linux, while the others are installed on all platforms. Unfortunately, sticking nvidia rtc in the test dependencies breaks the MacOS CI. Ideally we'd support all platforms, but I don't want to add CUDA Toolkit as a dependency and for the time being testing on Linux should suffice to ensure correctness of the generated CUDA wrapper code.

ci = [
"coveralls",
Expand Down
Loading