python - 在 Tensorflow 中添加 GPU Op

标签 python c++ cuda tensorflow

我正尝试大致按照 this 文档向 TensorFlow 添加一个新操作。不同之处在于我要实现的是基于 GPU 的操作。我要添加的操作是来自 here 的 cuda 操作(cuda_op.py、cuda_op_kernel.cc、cuda_op_kernel.cu.cc)。我尝试在 tensorflow 之外编译这些文件,并使用 tf.load_op_library 把它们加载进来。我做了一些更改,以下是我的文件:

cuda_op_kernel.cc

#include <limits>

#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/shape_inference.h"

using namespace tensorflow;  // NOLINT(build/namespaces)

// Registers the "AddOne" op: a single int32 input and a single int32 output.
// The shape function gives output 0 the same shape as input 0.
REGISTER_OP("AddOne")
    .Input("input: int32")
    .Output("output: int32")
    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* ctx) {
      // Forward the input's shape handle unchanged to the output.
      const auto input_shape = ctx->input(0);
      ctx->set_output(0, input_shape);
      return Status::OK();
    });

// Forward declaration of the CUDA launcher defined in cuda_op_kernel.cu.
// It launches the kernel that writes in[i] + 1 into out[i] for the N elements.
void AddOneKernelLauncher(const int* in, const int N, int* out);

class AddOneOp : public OpKernel {
 public:
  explicit AddOneOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    // Grab the input tensor
    const Tensor& input_tensor = context->input(0);
    auto input = input_tensor.flat<int32>();

    // Create an output tensor
    Tensor* output_tensor = NULL;
    OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor.shape(),
                                                     &output_tensor));
    auto output = output_tensor->template flat<int32>();

    // Set all but the first element of the output tensor to 0.
    const int N = input.size();
    // Call the cuda kernel launcher
    AddOneKernelLauncher(input.data(), N, output.data());

  }
};

// Registers AddOneOp as the implementation of the "AddOne" op on DEVICE_GPU.
REGISTER_KERNEL_BUILDER(Name("AddOne").Device(DEVICE_GPU), AddOneOp);

cuda_op_kernel.cu

#define EIGEN_USE_GPU
#include <cuda.h>
#include <stdio.h>

// Writes in[idx] + 1 to out[idx] for every idx in [0, N).
// Each thread starts at its global index and advances by the total number of
// launched threads (blockDim.x * gridDim.x), so any launch configuration
// covers all N elements.
__global__ void AddOneKernel(const int* in, const int N, int* out) {
  const unsigned int stride = blockDim.x * gridDim.x;
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  while (idx < N) {
    out[idx] = in[idx] + 1;
    idx += stride;
  }
}

// Launches AddOneKernel with 32 blocks of 256 threads over the N elements of
// `in`/`out`, then waits for the device to finish. Any CUDA error is printed
// to stdout; nothing is returned to the caller.
void AddOneKernelLauncher(const int* in, const int N, int* out) {
  AddOneKernel<<<32, 256>>>(in, N, out);

  // A kernel that fails to launch (e.g. no device code matching the GPU's
  // architecture) is reported through cudaGetLastError(); checking only the
  // later synchronize can miss it, leaving `out` untouched with no message.
  cudaError_t cudaerr = cudaGetLastError();
  if (cudaerr == cudaSuccess) {
    // The launch was accepted; wait for completion to catch execution errors.
    cudaerr = cudaDeviceSynchronize();
  }
  if (cudaerr != cudaSuccess)
    printf("kernel launch failed with error \"%s\".\n", cudaGetErrorString(cudaerr));
}

CMakeLists.txt

cmake_minimum_required(VERSION 3.5)

# TensorFlow header location, found by running:
#   python -c 'import tensorflow as tf; print(tf.sysconfig.get_include())'
include_directories(/usr/local/lib/python3.5/dist-packages/tensorflow/include)

# Provides cuda_add_library() and the CUDA_* variables used below.
find_package(CUDA)

# Host C++ compiler flags, set based on the tutorial.
set (CMAKE_CXX_FLAGS "--std=c++11 -fPIC -O2 -D_GLIBCXX_USE_CXX11_ABI=0")

# Pass the host C++ flags on to the compiler nvcc uses for host code.
SET(CUDA_PROPAGATE_HOST_FLAGS ON)

# Build the shared library libcuda_op.so from the CUDA and C++ sources.
# NOTE(review): the -gencode option below requests device code for
# compute_20/sm_20 only; confirm this matches the GPU the op will run on.
cuda_add_library(
    cuda_op SHARED
    src/cuda_op_kernel.cu
    src/cuda_op_kernel.cc
    OPTIONS -gencode=arch=compute_20,code=sm_20)

# Copy the test script into the build folder.
configure_file(src/test.py test.py COPYONLY)

test.py

import tensorflow as tf

# Load the compiled custom-op library; the op is exposed as `add_one` on the
# returned module.
cuda_ops = tf.load_op_library('./libcuda_op.so')

start = [5, 4, 3, 2, 1]
with tf.Session() as sess:
    # Print the input, then the op's result evaluated in the session.
    print(start)
    incremented = cuda_ops.add_one(start)
    print(incremented.eval())

我能够成功编译并运行 test.py,但输出始终是 [0 0 0 0 0]。如果我把 AddOneKernel<<<32, 256>>>(in, N, out); 替换为 for (int i = 0; i < N; i++) out[i] = in[i] + 1;,并把 DEVICE_GPU 替换为 DEVICE_CPU,op 就会输出正确的值 [6 5 4 3 2](使用完全相同的 CMakeLists.txt)。

知道如何获得正确的返回值吗?

最佳答案

我不太记得是从哪里找到这些 CUDA 的 cmake 配置的,但其中的选项不知为何破坏了编译结果。将 CMakeLists.txt 中的 cuda_add_library 替换为以下内容后,问题就解决了。

# No extra nvcc OPTIONS are passed; only the two source files are listed.
cuda_add_library(
    cuda_op SHARED
    src/cuda_op_kernel.cu
    src/cuda_op_kernel.cc)

关于python - 在 Tensorflow 中添加 GPU Op,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/44403127/

相关文章:

c++ - 将重载运算符重构为非成员函数是否会破坏任何代码?

haskell - 如何在加速示例包的示例上启用 CUDA?

macos - nvcc 未知选项 -no_pie

CUDA 共享内存阵列 - 奇怪的行为

python - 期望 volvo2d_input_1 有 4 个维度,但得到形状为 (150, 150, 1) 的数组

c++ - 如何使用 C++ 实现 native 脚本

c++ - 类构造函数中的类型转换不正确

python - 用 Python 自动化无聊的事情 - Collat​​z Sequence

python - Python 和 Scala 程序之间的进程间通信

python - VTK : how to read grid cells' lenth, 宽度和高度?