问题 cuda共享库链接：未定义引用cudaRegisterLinkedBinary

目标：

创建一个包含我的 CUDA 内核的共享库，并为它提供一个不依赖 CUDA 的包装器/头文件。
为该共享库创建一个测试（test）可执行文件。

问题

共享库 MYLIB.so 似乎编译得很好。（没问题）。
链接错误：

./libMYLIB.so: undefined reference to __cudaRegisterLinkedBinary_39_tmpxft_000018cf_00000000_6_MYLIB_cpp1_ii_74c599a1

简化的makefile：

# Links MYLIB.o into the shared library libMYLIB.so against the CUDA runtime.
# NOTE(review): the target is spelled libMYlib.so but the recipe writes
# libMYLIB.so, so the target file itself is never produced.
libMYlib.so :  MYLIB.o
    g++  -shared  -Wl,-soname,libMYLIB.so  -o libMYLIB.so    MYLIB.o  -L/the/cuda/lib/dir  -lcudart


# Compiles MYLIB.cu with -dc (relocatable device code) as position-independent code.
# NOTE(review): no `nvcc -dlink` device-link step follows this -dc compile
# anywhere in this makefile.
MYLIB.o : MYLIB.cu   MYLIB.h
    nvcc  -m64   -arch=sm_20 -dc  -Xcompiler '-fPIC'  MYLIB.cu  -o  MYLIB.o  -L/the/cuda/lib/dir  -lcudart


# Builds the test executable from test.cpp and links it against libMYLIB.so
# in the current directory (rpath set to ".").
test : test.cpp  libMYlib.so
        g++   test.cpp  -o test  -L.  -ldl -Wl,-rpath,.   -lMYLIB  -L/the/cuda/lib/dir  -lcudart

确实

nm libMYLIB.so 表明所有 CUDA api函数是“未定义的符号”：

         U __cudaRegisterFunction
         U __cudaRegisterLinkedBinary_39_tmpxft_0000598c_00000000_6_CUPA_cpp1_ii_74c599a1
         U cudaEventRecord
         U cudaFree
         U cudaGetDevice
         U cudaGetDeviceProperties
         U cudaGetErrorString
         U cudaLaunch
         U cudaMalloc
         U cudaMemcpy

所以 CUDA 不知何故没有被链接进共享库 MYLIB.so。我遗漏了什么？

CUDA甚至没有以某种方式链接到目标文件：

nm MYLIB.o

         U __cudaRegisterFunction
         U __cudaRegisterLinkedBinary_39_tmpxft_0000598c_00000000_6_CUPA_cpp1_ii_74c599a1
         U cudaEventRecord
         U cudaFree
         U cudaGetDevice
         U cudaGetDeviceProperties
         U cudaGetErrorString
         U cudaLaunch
         U cudaMalloc
         U cudaMemcpy

（与上述相同）

下面是一个按照您所描述的思路创建 Linux 共享对象的示例：

创建一个包含我的 CUDA 内核的共享库，并提供不依赖 CUDA 的包装器/头文件。
为该共享库创建一个测试可执行文件。

首先是共享库。这个构建命令如下：

# Step 1: compile both .cu files as position-independent code with -dc
# (relocatable device code); the next step consumes test1.o and test2.o.
nvcc -arch=sm_20 -Xcompiler '-fPIC' -dc test1.cu test2.cu
# Step 2: device-link the two objects (-dlink) into link.o.
nvcc -arch=sm_20 -Xcompiler '-fPIC' -dlink test1.o test2.o -o link.o
# Step 3: host-link all three objects into the shared library test.so
# against the CUDA runtime (libcudart).
g++ -shared -o test.so test1.o test2.o link.o -L/usr/local/cuda/lib64 -lcudart

您的makefile中可能缺少上面的第二步，但我没有分析您的makefile是否存在任何其他问题。

现在，对于测试可执行文件，构建命令如下：

# Compile main.cpp with the host compiler only (no nvcc involved).
g++ -c main.cpp
# Link the executable directly against the shared library file test.so.
g++ -o testmain main.o test.so

要运行它，只需执行 testmain 可执行文件，但请确保 test.so 库所在的目录已包含在您的 LD_LIBRARY_PATH 中。

这些是我用于测试目的的文件：

test1.h：

// Runs the first GPU self-test. Returns 0 on success; the implementation in
// test1.cu calls exit(1) on any failure. The declaration uses no CUDA types,
// so main.cpp can include this header and be compiled with plain g++.
int my_test_func1();

test1.cu：

#include <stdio.h>
#include "test1.h"

#define DSIZE 1024  // number of int elements in the test buffer
#define DVAL 10     // value every element must hold after the kernel has run
#define nTPB 256    // threads per block used for the kernel launch

// cudaCheckErrors(msg): reads the most recent CUDA runtime error via
// cudaGetLastError(). If an error is pending, prints `msg`, the CUDA error
// string and the call site to stderr, then ends the process with exit(1).
// The do { ... } while (0) wrapper makes the macro usable as one statement.
#define cudaCheckErrors(msg) \
    do { \
        const cudaError_t lastStatus_ = cudaGetLastError(); \
        if (cudaSuccess != lastStatus_) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(lastStatus_), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

// Adds DVAL to each of the first DSIZE elements of `data`.
//
// data: pointer to at least DSIZE ints; the host side in this file passes a
//       buffer obtained from cudaMalloc.
// Launch layout: 1D grid of 1D blocks, one thread per element. The
// `idx < DSIZE` guard lets the grid be rounded up past DSIZE safely.
//
// Fix: the original wrote `data[idx] =+ DVAL`, which parses as
// `data[idx] = (+DVAL)` (plain assignment). It is now the intended `+=`.
// The host zero-fills the buffer first, so its result check is unchanged.
__global__ void my_kernel1(int *data){
  int idx = threadIdx.x + (blockDim.x * blockIdx.x);
  if (idx < DSIZE) data[idx] += DVAL;
}

// Self-test for my_kernel1.
//
// Allocates a DSIZE-element host buffer and a matching device buffer, copies
// zeros to the device, launches my_kernel1 over it, copies the result back
// and verifies that every element equals DVAL.
//
// Returns 0 on success. On a host allocation failure, any CUDA error or a
// result mismatch it prints a diagnostic and terminates the process with
// exit(1), so it never returns non-zero.
int my_test_func1(){

  int *d_data, *h_data;
  h_data = (int *) malloc(DSIZE * sizeof(int));
  if (h_data == 0) {printf("malloc fail\n"); exit(1);}
  cudaMalloc((void **)&d_data, DSIZE * sizeof(int));
  cudaCheckErrors("cudaMalloc fail");
  for (int i = 0; i < DSIZE; i++) h_data[i] = 0;
  cudaMemcpy(d_data, h_data, DSIZE * sizeof(int), cudaMemcpyHostToDevice);
  cudaCheckErrors("cudaMemcpy fail");
  // Ceil-divide so the grid covers all DSIZE elements.
  my_kernel1<<<((DSIZE+nTPB-1)/nTPB), nTPB>>>(d_data);
  // Wait for the kernel so that execution errors are visible to the check below.
  cudaDeviceSynchronize();
  cudaCheckErrors("kernel");
  cudaMemcpy(h_data, d_data, DSIZE * sizeof(int), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMemcpy 2");
  for (int i = 0; i < DSIZE; i++)
    if (h_data[i] != DVAL) {printf("Results check failed at offset %d, data was: %d, should be %d\n", i, h_data[i], DVAL); exit(1);}
  // Release both buffers; the original leaked them on the success path,
  // which matters for a library function that may be called repeatedly.
  cudaFree(d_data);
  cudaCheckErrors("cudaFree fail");
  free(h_data);
  printf("Results check 1 passed!\n");
  return 0;
}

test2.h：

// Runs the second GPU self-test. Returns 0 on success; the implementation in
// test2.cu calls exit(1) on any failure. The declaration uses no CUDA types,
// so main.cpp can include this header and be compiled with plain g++.
int my_test_func2();

test2.cu：

#include <stdio.h>
#include "test2.h"

#define DSIZE 1024  // number of int elements in the test buffer
#define DVAL 20     // value every element must hold after the kernel has run
#define nTPB 256    // threads per block used for the kernel launch

// cudaCheckErrors(msg): reads the most recent CUDA runtime error via
// cudaGetLastError(). If an error is pending, prints `msg`, the CUDA error
// string and the call site to stderr, then ends the process with exit(1).
// The do { ... } while (0) wrapper makes the macro usable as one statement.
#define cudaCheckErrors(msg) \
    do { \
        const cudaError_t lastStatus_ = cudaGetLastError(); \
        if (cudaSuccess != lastStatus_) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(lastStatus_), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

// Adds DVAL to each of the first DSIZE elements of `data`.
//
// data: pointer to at least DSIZE ints; the host side in this file passes a
//       buffer obtained from cudaMalloc.
// Launch layout: 1D grid of 1D blocks, one thread per element. The
// `idx < DSIZE` guard lets the grid be rounded up past DSIZE safely.
//
// Fix: the original wrote `data[idx] =+ DVAL`, which parses as
// `data[idx] = (+DVAL)` (plain assignment). It is now the intended `+=`.
// The host zero-fills the buffer first, so its result check is unchanged.
__global__ void my_kernel2(int *data){
  int idx = threadIdx.x + (blockDim.x * blockIdx.x);
  if (idx < DSIZE) data[idx] += DVAL;
}

// Self-test for my_kernel2.
//
// Allocates a DSIZE-element host buffer and a matching device buffer, copies
// zeros to the device, launches my_kernel2 over it, copies the result back
// and verifies that every element equals DVAL.
//
// Returns 0 on success. On a host allocation failure, any CUDA error or a
// result mismatch it prints a diagnostic and terminates the process with
// exit(1), so it never returns non-zero.
int my_test_func2(){

  int *d_data, *h_data;
  h_data = (int *) malloc(DSIZE * sizeof(int));
  if (h_data == 0) {printf("malloc fail\n"); exit(1);}
  cudaMalloc((void **)&d_data, DSIZE * sizeof(int));
  cudaCheckErrors("cudaMalloc fail");
  for (int i = 0; i < DSIZE; i++) h_data[i] = 0;
  cudaMemcpy(d_data, h_data, DSIZE * sizeof(int), cudaMemcpyHostToDevice);
  cudaCheckErrors("cudaMemcpy fail");
  // Ceil-divide so the grid covers all DSIZE elements.
  my_kernel2<<<((DSIZE+nTPB-1)/nTPB), nTPB>>>(d_data);
  // Wait for the kernel so that execution errors are visible to the check below.
  cudaDeviceSynchronize();
  cudaCheckErrors("kernel");
  cudaMemcpy(h_data, d_data, DSIZE * sizeof(int), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMemcpy 2");
  for (int i = 0; i < DSIZE; i++)
    if (h_data[i] != DVAL) {printf("Results check failed at offset %d, data was: %d, should be %d\n", i, h_data[i], DVAL); exit(1);}
  // Release both buffers; the original leaked them on the success path,
  // which matters for a library function that may be called repeatedly.
  cudaFree(d_data);
  cudaCheckErrors("cudaFree fail");
  free(h_data);
  printf("Results check 2 passed!\n");
  return 0;
}

main.cpp：

#include <stdio.h>

#include "test1.h"
#include "test2.h"

// Test driver: calls the two library entry points in order and always
// returns 0. Their return values are deliberately discarded.
int main(void)
{
  (void) my_test_func1();
  (void) my_test_func2();

  return 0;
}

按照上面给出的命令编译后运行 ./testmain，我得到如下输出：

$ ./testmain
Results check 1 passed!
Results check 2 passed!

请注意，如果您愿意，可以生成 libtest.so 来代替 test.so，然后使用下面修改后的构建步骤来生成测试可执行文件：

# Compile main.cpp with the host compiler only.
g++ -c main.cpp
# Link against libtest.so, searched for in the current directory (-L. -ltest).
g++ -o testmain main.o -L. -ltest

我认为这两种方式没有任何区别，但后一种可能是更为人熟悉的语法。

我确信有多种方法可以实现这一目标，这只是其中一个例子。您可能还希望查阅 nvcc 手册的相关章节，并研究其中的示例。

编辑： 我在cuda 5.5 RC下进行了测试，最后的应用程序链接步骤抱怨找不到cudart lib（warning: libcudart.so.5.5., needed by ./libtest.so, not found）。但是，以下相对简单的修改（例如Makefile）应该在cuda 5.0或cuda 5.5下工作。

Makefile文件：

# Builds libtest.so (CUDA kernels behind a CUDA-free C++ interface) and the
# testmain executable that links against it.
# NOTE: every recipe line below must begin with a TAB character, not spaces.

# Overridable settings; the defaults reproduce the original commands.
# NOTE(review): sm_20 is only accepted by older CUDA toolkits -- override
# ARCH to match your GPU/toolkit, e.g. `make ARCH=sm_70`.
CUDA_LIB ?= /usr/local/cuda/lib64
ARCH     ?= sm_20

# `clean` produces no file, so it must always run when requested.
.PHONY : clean

# main.o is listed BEFORE the libraries: the linker resolves symbols left to
# right, so libraries named ahead of the object that needs them may be
# discarded (e.g. when --as-needed is the toolchain default), which leaves
# my_test_func1/my_test_func2 undefined. The headers are prerequisites because
# main.cpp includes them.
testmain : main.cpp  test1.h test2.h  libtest.so
	g++ -c main.cpp
	g++ -o testmain  main.o  -L.  -Wl,-rpath,.  -ltest  -L$(CUDA_LIB) -lcudart  -ldl

# Host link of the shared library: both relocatable objects plus the
# device-link object, against the CUDA runtime.
libtest.so : link.o
	g++  -shared -Wl,-soname,libtest.so -o libtest.so    test1.o test2.o link.o  -L$(CUDA_LIB)  -lcudart

# First command: compile with relocatable device code (-dc) -> test1.o, test2.o.
# Second command: device-link step (-dlink) -> link.o.
link.o : test1.cu test2.cu   test1.h test2.h
	nvcc  -m64   -arch=$(ARCH) -dc  -Xcompiler '-fPIC'  test1.cu test2.cu
	nvcc  -m64   -arch=$(ARCH) -Xcompiler '-fPIC' -dlink test1.o test2.o -o link.o

clean :
	rm -f testmain test1.o test2.o link.o libtest.so main.o

您是否尝试过显式禁用可重定位设备代码，即使用 -rdc=false？我在使用 -rdc=true 时遇到了 undefined reference to __cudaRegisterLinkedBinaryWhatever，去掉该选项后错误就消失了。不过我了解得还不够，无法解释究竟是怎么回事。

CUDA 运行时库没有静态版本，因此您不应期望运行时库的符号被静态地包含在目标文件或共享库中，所以您最后两处编辑/补充的内容只是干扰信息（red herring），与问题本身无关。 - talonmies

好的，我之前不知道这一点，说得好。 - cmo

@talonmies实际上从CUDA Toolkit 5.5开始，还有一个静态版本的CUDA Runtime库 - RoBiK

问题仍然存在。按照你的例子，一切都顺利编译，直到最后一步 - 创建测试可执行文件，此时 __cudaRegisterLinkedBinary_39_tmpxft ...如前所述，抛出错误。 - cmo

我不确定问题是什么。它似乎对我来说很完美。您是否正在遵循我的步骤并准确使用我的文件？你在使用cuda 5.0吗？ - Robert Crovella

@MatthewParks我和__cudaRegisterLinkedBinary_39_tmpxft有同样的问题...，你解决了吗？ - Farzad Salimi Jazi

这适用于CUDA 7.0 - Rupert Nash

问题 cuda共享库链接：未定义引用cudaRegisterLinkedBinary

答案:

热门问题