Thrust架构简介

Thrust是一个高层次的C++模板库，它封装了CUDA C++ API的基本操作，提供了与STL类似的容器、算法和迭代器。

一、Thrust基础操作

Thrust的基础操作主要包括向量(vector)、迭代器(iterator)和算法(algorithm)。

1、向量

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>

int main() {
    // 创建大小为3的CPU向量，元素都是0
    thrust::host_vector<int> hvec(3);

    // 创建大小为3的GPU向量，元素都是0
    thrust::device_vector<int> dvec(3);

    // 向CPU向量中填充随机数
    thrust::generate(hvec.begin(), hvec.end(), rand);

    // 将CPU向量拷贝到GPU向量中
    dvec = hvec;

    return 0;
}

2、迭代器

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>

int main() {
    // 创建大小为5的CPU向量
    thrust::host_vector<int> hvec(5);

    // 将0,1,2,3,4填充到CPU向量中
    thrust::counting_iterator<int> first(0);
    thrust::copy_n(first, 5, hvec.begin());

    return 0;
}

3、算法

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>

int main() {
    // Device-side (GPU) vector of five ints, initially all 0.
    thrust::device_vector<int> dvec(5);

    // Fill the vector with 1,2,3,4,5. The two-argument form of
    // thrust::sequence starts counting at 0, so the starting value 1 has to
    // be passed explicitly to get the intended range.
    thrust::sequence(dvec.begin(), dvec.end(), 1);

    return 0;
}

二、Thrust高级操作

Thrust的高级操作主要包括重载运算符、自定义函数、并行算法和批处理操作。

1、重载运算符

#include <thrust/device_vector.h>

// Plain 2-D point with integer coordinates.
struct Point {
    int x;
    int y;

    // Component-wise addition. Marked __host__ __device__ so the operator can
    // be called from both CPU and GPU code.
    __host__ __device__
    Point operator+(const Point& other) const {
        return Point{x + other.x, y + other.y};
    }
};

int main() {
    // 创建大小为3的GPU向量
    thrust::device_vector<Point> dvec(3);

    // 向GPU向量中填充随机点
    thrust::generate(dvec.begin(), dvec.end(), [] () {
        Point p;
        p.x = rand();
        p.y = rand();
        return p;
    });

    // 计算两个点的元素和
    Point sum = dvec[0] + dvec[1];

    return 0;
}

2、自定义函数

#include <thrust/device_vector.h>
#include <thrust/functional.h>

// Binary functor computing a * x + y for a scale factor `a` fixed at
// construction time.
struct saxpy_functor {
    const float a;  // scale factor applied to the first operand

    saxpy_functor(float scale) : a(scale) {}

    // Callable from both host and device code; returns a * x + y.
    __host__ __device__
    float operator()(const float& x, const float& y) const {
        const float scaled = a * x;
        return scaled + y;
    }
};

int main() {
    // 创建大小为3的GPU向量，元素都是1.0
    thrust::device_vector<float> x(3, 1.0f);
    thrust::device_vector<float> y(3, 2.0f);

    // 将x中的每个元素乘以2，再将y中的每个元素加上3
    thrust::transform(x.begin(), x.end(), y.begin(), y.begin(), saxpy_functor(2.0f));

    return 0;
}

3、并行算法

#include <thrust/device_vector.h>
#include <thrust/random.h>
#include <thrust/sequence.h>
#include <thrust/shuffle.h>
#include <thrust/sort.h>

int main() {
    // 创建大小为5的GPU向量，元素都是0~4的随机排列
    thrust::device_vector<int> dvec(5);
    thrust::sequence(dvec.begin(), dvec.end());
    thrust::random_shuffle(dvec.begin(), dvec.end());

    // 对GPU向量中的元素进行排序
    thrust::sort(dvec.begin(), dvec.end());

    return 0;
}

4、批处理操作

#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/iterator/zip_iterator.h>

int main() {
    // 创建大小为3的GPU向量
    thrust::device_vector<int> a(3);
    thrust::device_vector<int> b(3);
    thrust::device_vector<int> c(3);

    // 将1,2,3填充到a中，将4,5,6填充到b中，将7,8,9填充到c中
    thrust::sequence(a.begin(), a.end(), 1);
    thrust::sequence(b.begin(), b.end(), 4);
    thrust::sequence(c.begin(), c.end(), 7);

    // 将三个向量的迭代器zip在一起
    thrust::zip_iterator<thrust::tuple<thrust::device_vector<int>::iterator, thrust::device_vector<int>::iterator, thrust::device_vector<int>::iterator> > zip_begin = thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin(), c.begin()));
    thrust::zip_iterator<thrust::tuple<thrust::device_vector<int>::iterator, thrust::device_vector<int>::iterator, thrust::device_vector<int>::iterator> > zip_end = thrust::make_zip_iterator(thrust::make_tuple(a.end(), b.end(), c.end()));

    // 将每个元素变为原来的平方
    thrust::for_each(zip_begin, zip_end, [] (thrust::tuple<int&, int&, int> t) {
        int& x = thrust::get<0>(t);
        int& y = thrust::get<1>(t);
        int& z = thrust::get<2>(t);

        x = x * x;
        y = y * y;
        z = z * z;
    });

    return 0;
}