I am following Jeremy's tutorial on writing a CUDA kernel for RGB-to-grayscale conversion and reproduced his notebook, but the build fails when I call module = load_cuda(cuda_src, cpp_src, ['rgb_to_grayscale'], verbose=True). The output is:
Using /hdd4/srinath2/.cache/torch_extensions/py312_cu121 as PyTorch extensions root...
Creating extension directory /hdd4/srinath2/.cache/torch_extensions/py312_cu121/inline_ext...
Detected CUDA files, patching ldflags
Emitting ninja build file /hdd4/srinath2/.cache/torch_extensions/py312_cu121/inline_ext/build.ninja...
Building extension module inline_ext...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
[1/3] /usr/bin/nvcc --generate-dependencies-with-compile --dependency-output cuda.cuda.o.d -DTORCH_EXTENSION_NAME=inline_ext -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /hdd4/srinath2/.conda/envs/llm_env/lib/python3.12/site-packages/torch/include -isystem /hdd4/srinath2/.conda/envs/llm_env/lib/python3.12/site-packages/torch/include/torch/csrc/api/include -isystem /hdd4/srinath2/.conda/envs/llm_env/lib/python3.12/site-packages/torch/include/TH -isystem /hdd4/srinath2/.conda/envs/llm_env/lib/python3.12/site-packages/torch/include/THC -isystem /hdd4/srinath2/.conda/envs/llm_env/include/python3.12 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_86,code=sm_86 --compiler-options '-fPIC' -std=c++17 -c /hdd4/srinath2/.cache/torch_extensions/py312_cu121/inline_ext/cuda.cu -o cuda.cuda.o
FAILED: cuda.cuda.o
/usr/bin/nvcc --generate-dependencies-with-compile --dependency-output cuda.cuda.o.d -DTORCH_EXTENSION_NAME=inline_ext -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /hdd4/srinath2/.conda/envs/llm_env/lib/python3.12/site-packages/torch/include -isystem /hdd4/srinath2/.conda/envs/llm_env/lib/python3.12/site-packages/torch/include/torch/csrc/api/include -isystem /hdd4/srinath2/.conda/envs/llm_env/lib/python3.12/site-packages/torch/include/TH -isystem /hdd4/srinath2/.conda/envs/llm_env/lib/python3.12/site-packages/torch/include/THC -isystem /hdd4/srinath2/.conda/envs/llm_env/include/python3.12 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_86,code=sm_86 --compiler-options '-fPIC' -std=c++17 -c /hdd4/srinath2/.cache/torch_extensions/py312_cu121/inline_ext/cuda.cu -o cuda.cuda.o
cc1plus: fatal error: cuda_runtime.h: No such file or directory
compilation terminated.
[2/3] c++ -MMD -MF main.o.d -DTORCH_EXTENSION_NAME=inline_ext -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /hdd4/srinath2/.conda/envs/llm_env/lib/python3.12/site-packages/torch/include -isystem /hdd4/srinath2/.conda/envs/llm_env/lib/python3.12/site-packages/torch/include/torch/csrc/api/include -isystem /hdd4/srinath2/.conda/envs/llm_env/lib/python3.12/site-packages/torch/include/TH -isystem /hdd4/srinath2/.conda/envs/llm_env/lib/python3.12/site-packages/torch/include/THC -isystem /hdd4/srinath2/.conda/envs/llm_env/include/python3.12 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -c /hdd4/srinath2/.cache/torch_extensions/py312_cu121/inline_ext/main.cpp -o main.o
ninja: build stopped: subcommand failed.
[Output truncated — it exceeded the notebook's output size limit; full traceback continues below.]
---------------------------------------------------------------------------
CalledProcessError Traceback (most recent call last)
File ~/.conda/envs/llm_env/lib/python3.12/site-packages/torch/utils/cpp_extension.py:2096, in _run_ninja_build(build_directory, verbose, error_prefix)
2095 stdout_fileno = 1
-> 2096 subprocess.run(
2097 command,
2098 stdout=stdout_fileno if verbose else subprocess.PIPE,
2099 stderr=subprocess.STDOUT,
2100 cwd=build_directory,
2101 check=True,
2102 env=env)
2103 except subprocess.CalledProcessError as e:
2104 # Python 2 and 3 compatible way of getting the error object.
File ~/.conda/envs/llm_env/lib/python3.12/subprocess.py:571, in run(input, capture_output, timeout, check, *popenargs, **kwargs)
570 if check and retcode:
--> 571 raise CalledProcessError(retcode, process.args,
572 output=stdout, stderr=stderr)
573 return CompletedProcess(process.args, retcode, stdout, stderr)
CalledProcessError: Command '['ninja', '-v']' returned non-zero exit status 1.
The above exception was the direct cause of the following exception:
RuntimeError Traceback (most recent call last)
...
2110 if hasattr(error, 'output') and error.output: # type: ignore[union-attr]
2111 message += f": {error.output.decode(*SUBPROCESS_DECODE_ARGS)}" # type: ignore[union-attr]
-> 2112 raise RuntimeError(message) from e
RuntimeError: Error building extension 'inline_ext'
The salient failure appears to be `cc1plus: fatal error: cuda_runtime.h: No such file or directory` during the nvcc step, which suggests the compiler cannot find the CUDA toolkit headers. Please let me know how to debug this or how to proceed further.