目标

Thrust 是一个功能极其强大的库，可用于实现各种 CUDA 加速算法。但是，Thrust 的设计面向向量（vector），而不是行与行之间可能带有填充的 pitched 矩阵。本教程将讨论如何把 cv::cuda::GpuMat 包装成 Thrust 迭代器，从而可以配合 Thrust 算法使用。

本教程将向您展示如何

将 GpuMat 包装到 thrust 迭代器中
用随机数填充 GpuMat
就地对 GpuMat 的一列进行排序
将大于 0 的值复制到新的 GPU 矩阵
将流与 thrust 结合使用

将 GpuMat 包装到 thrust 迭代器中

以下代码将为 GpuMat 生成一个迭代器

/*
@brief GpuMatBeginItr returns a thrust-compatible iterator positioned at the first element of the GpuMat's memory.
@param mat is the input matrix; its depth must match the element type T.
@param channel is the matrix channel the iterator accesses. If set to -1, the matrix is viewed as
       single-channel so the iterator visits every element in sequence.
*/
template<typename T>
thrust::permutation_iterator<thrust::device_ptr<T>, thrust::transform_iterator<step_functor<T>, thrust::counting_iterator<int>>> GpuMatBeginItr(cv::cuda::GpuMat mat, int channel = 0)
{
    if (channel == -1)
    {
        // mat is a by-value copy of the header, so reshaping it does not affect the caller's matrix.
        mat = mat.reshape(1);
        channel = 0;
    }
    CV_Assert(mat.depth() == cv::DataType<T>::depth);
    // Reject negative channels other than the -1 sentinel handled above; they would
    // otherwise move the base pointer to before the start of the matrix data.
    CV_Assert(channel >= 0 && channel < mat.channels());
    return thrust::make_permutation_iterator(
        thrust::device_pointer_cast(mat.ptr<T>(0) + channel),
        thrust::make_transform_iterator(
            thrust::make_counting_iterator(0),
            step_functor<T>(mat.cols, mat.step / sizeof(T), mat.channels())));
}

/*
@brief GpuMatEndItr returns a thrust-compatible iterator positioned at index rows*cols, i.e. one past
       the last element visited by the matching GpuMatBeginItr iterator.
@param mat is the input matrix; its depth must match the element type T.
@param channel is the matrix channel the iterator accesses. If set to -1, the matrix is viewed as
       single-channel so the iterator visits every element in sequence.
*/
template<typename T>
thrust::permutation_iterator<thrust::device_ptr<T>, thrust::transform_iterator<step_functor<T>, thrust::counting_iterator<int>>> GpuMatEndItr(cv::cuda::GpuMat mat, int channel = 0)
{
    if (channel == -1)
    {
        // mat is a by-value copy of the header, so reshaping it does not affect the caller's matrix.
        mat = mat.reshape(1);
        channel = 0;
    }
    CV_Assert(mat.depth() == cv::DataType<T>::depth);
    // Reject negative channels other than the -1 sentinel handled above; they would
    // otherwise move the base pointer to before the start of the matrix data.
    CV_Assert(channel >= 0 && channel < mat.channels());
    return thrust::make_permutation_iterator(
        thrust::device_pointer_cast(mat.ptr<T>(0) + channel),
        thrust::make_transform_iterator(
            thrust::make_counting_iterator(mat.rows*mat.cols),
            step_functor<T>(mat.cols, mat.step / sizeof(T), mat.channels())));
}

我们的目标是得到这样一个迭代器：它从矩阵的开头开始，并能正确递增以依次访问矩阵的各个元素。对于连续存储的一行来说，这很简单；但对于 pitched 矩阵（行与行之间可能带有填充）的一列呢？为此，迭代器需要知道矩阵的尺寸和步长（step）。这些信息封装在 step_functor 中。

/*
@brief step_functor maps a linear element index to the offset of that element from the start of
       the matrix, counted in units of T, for a matrix whose rows start `step` units apart.
*/
template<typename T> struct step_functor : public thrust::unary_function<int, int>
{
    int columns;   // number of elements in one row
    int step;      // distance between the starts of consecutive rows, in units of T
    int channels;  // distance between consecutive elements of a row, in units of T

    __host__ __device__ step_functor(int columns_, int step_, int channels_ = 1) : columns(columns_), step(step_), channels(channels_) { }

    // Derives the layout from a matrix; the matrix depth must match T.
    __host__ step_functor(cv::cuda::GpuMat& mat)
    {
        CV_Assert(mat.depth() == cv::DataType<T>::depth);
        columns = mat.cols;
        // mat.step is in bytes; convert it to a count of T elements.
        step = mat.step / sizeof(T);
        channels = mat.channels();
    }

    // @param x linear element index (row-major, 0-based)
    // @return offset of element x from the first element, in units of T
    __host__ __device__
    int operator()(int x) const
    {
        int row = x / columns;
        int idx = (row * step) + (x % columns)*channels;
        return idx;
    }
};

step_functor 接受一个索引值，并返回该元素相对于矩阵开头的偏移量。计数迭代器（counting_iterator）只是在像素元素的索引范围内递增。把二者组合成 transform_iterator 后，我们就得到了一个从 0 计数到 M*N、并能按 GpuMat 的 pitched 内存布局正确步进的迭代器。不过，它还不包含任何内存位置信息，为此我们需要一个 thrust::device_ptr。再通过 permutation_iterator 把设备指针与 transform_iterator 组合起来，就可以让 thrust 指向矩阵的第一个元素，并按相应的步长前进。

用随机数填充 GpuMat

现在我们有一些很好的函数来为 thrust 创建迭代器，让我们用它们做一些 OpenCV 做不到的事情。不幸的是，在撰写本文时，OpenCV 没有任何 Gpu 随机数生成。谢天谢地，thrust 有，现在在两者之间进行互操作非常简单。示例取自 http://stackoverflow.com/questions/12614164/generating-a-random-number-vector-between-0-and-1-0-using-thrust

首先，我们需要编写一个 functor，它将生成我们的随机值。

// Functor that turns an integer into a pseudo-random float drawn from a uniform
// distribution parameterised by a and b.
struct prg
{
    float a, b;

    __host__ __device__
    prg(float _a = 0.f, float _b = 1.f)
        : a(_a), b(_b)
    {
    }

    __host__ __device__
    float operator()(const unsigned int n) const
    {
        thrust::uniform_real_distribution<float> uniform(a, b);
        thrust::default_random_engine engine;
        // Advance the freshly constructed engine by n draws before sampling.
        engine.discard(n);
        return uniform(engine);
    }
};

这个 functor 接受一个整数值，并输出一个介于 a 和 b 之间的值。现在我们将使用 thrust::transform，用介于 -1 和 1 之间的值填充我们的矩阵。

  {
      // 1 x 100 single-channel float matrix on the GPU.
      cv::cuda::GpuMat d_value(1, 100, CV_32F);
      auto valueBegin = GpuMatBeginItr<float>(d_value);
      auto valueEnd = GpuMatEndItr<float>(d_value);

      // One integer per column, used as the input to prg::operator().
      auto indexFirst = thrust::make_counting_iterator(0);
      auto indexLast = thrust::make_counting_iterator(d_value.cols);
      thrust::transform(indexFirst, indexLast, valueBegin, prg(-1, 1));

      // Host-side matrix constructed from the GPU matrix for inspection.
      cv::Mat h_value(d_value);
  }

就地对 GpuMat 的一列进行排序

让我们用随机值和一个索引填充矩阵元素。之后，我们将对随机数和索引进行排序。

  {
      // 1 x 100 matrix with two 32-bit integer channels.
      cv::cuda::GpuMat d_data(1, 100, CV_32SC2);

      // Thrust-compatible begin and end iterators to channel 1 of this matrix (the keys)
      auto keyBegin = GpuMatBeginItr<int>(d_data, 1);
      auto keyEnd = GpuMatEndItr<int>(d_data, 1);

      // Thrust-compatible begin and end iterators to channel 0 of this matrix (the indices)
      auto idxBegin = GpuMatBeginItr<int>(d_data, 0);
      auto idxEnd = GpuMatEndItr<int>(d_data, 0);

      // Fill the index channel with the sequence 0, 1, 2, ...
      thrust::sequence(idxBegin, idxEnd);

      // Fill the key channel with random numbers between 0 and 10. The counting iterator
      // supplies one integer per position as the input to prg::operator().
      auto seedFirst = thrust::make_counting_iterator(0);
      auto seedLast = thrust::make_counting_iterator(d_data.cols);
      thrust::transform(seedFirst, seedLast, keyBegin, prg(0, 10));

      // Sort the key channel and the index channel together so each key keeps its index
      thrust::sort_by_key(keyBegin, keyEnd, idxBegin);

      // Host-side matrix constructed from the GPU matrix for inspection.
      cv::Mat h_idx(d_data);
  }

在使用流的同时将大于 0 的值复制到新的 GPU 矩阵

在此示例中，我们将了解如何将 cv::cuda::Stream 与 thrust 结合使用。不幸的是，此特定示例使用的函数必须将结果返回到 CPU，因此这并不是流的最佳使用方式。

  {
// Example: fill a GPU matrix with random values on a stream, then copy the values
// greater than 0 into a new GPU matrix.
cv::cuda::GpuMat d_value(1, 100, CV_32F);
auto valueBegin = GpuMatBeginItr<float>(d_value);
auto valueEnd = GpuMatEndItr<float>(d_value);
cv::cuda::Stream stream;
//! [random_gen_stream]
// Same as the random generation code from before except now the transformation is being performed on a stream
thrust::transform(thrust::system::cuda::par.on(cv::cuda::StreamAccessor::getStream(stream)), thrust::make_counting_iterator(0), thrust::make_counting_iterator(d_value.cols), valueBegin, prg(-1, 1));
//! [random_gen_stream]
// Count the number of values we are going to copy
int count = thrust::count_if(thrust::system::cuda::par.on(cv::cuda::StreamAccessor::getStream(stream)), valueBegin, valueEnd, pred_greater<float>(0.0));
// Allocate a destination for the copied values
cv::cuda::GpuMat d_valueGreater(1, count, CV_32F);
// Copy the values that satisfy the predicate.
thrust::copy_if(thrust::system::cuda::par.on(cv::cuda::StreamAccessor::getStream(stream)), valueBegin, valueEnd, GpuMatBeginItr<float>(d_valueGreater), pred_greater<float>(0.0));
// Host-side matrix constructed from the GPU result for inspection.
cv::Mat h_greater(d_valueGreater);
  }

首先，我们将在流上用 -1 和 1 之间随机生成的数据填充 GPU mat。

// Same as the random generation code from before except now the transformation is being performed on a stream

thrust::transform(thrust::system::cuda::par.on(cv::cuda::StreamAccessor::getStream(stream)), thrust::make_counting_iterator(0), thrust::make_counting_iterator(d_value.cols), valueBegin, prg(-1, 1));

请注意 thrust::system::cuda::par.on(...) 的使用，这会创建一个执行策略，用于在流上执行 thrust 代码。cuda 工具包随附的 thrust 版本中存在一个错误，截至 7.5 版本，此错误尚未修复。此错误会导致代码无法在流上执行。但是，可以通过使用来自 git 存储库的最新版本的 thrust 来修复此错误。(http://github.com/thrust/thrust.git) 接下来，我们将使用 thrust::count_if 和以下谓词来确定有多少值大于 0

// Unary predicate: true when the argument is strictly greater than the stored threshold.
template<typename T> struct pred_greater
{
    T value;  // threshold the argument is compared against

    __host__ __device__ pred_greater(T value_)
        : value(value_)
    {
    }

    __host__ __device__ bool operator()(const T& candidate) const
    {
        return candidate > value;
    }
};

我们将使用这些结果来创建一个输出缓冲区来存储复制的值，然后我们将使用 copy_if 和相同的谓词来填充输出缓冲区。最后，我们将值下载到 CPU mat 中以进行查看。

目录

目标

将 GpuMat 包装到 thrust 迭代器中

用随机数填充 GpuMat

就地对 GpuMat 的一列进行排序

在使用流的同时将大于 0 的值复制到新的 GPU 矩阵