AO bench in CUDA

2009-09-11

There's no CUDA version of AO bench! So I decided to try writing one.

Apparently you allocate GPU memory with cudaMalloc, copy memory with cudaMemcpy, and invoke the GPU program with func<<<grid, block>>>(). On the GPU side, each thread uses threadIdx and blockIdx to work on its own piece of the data in parallel. For vector types, float4 is available.
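Just to spell the pattern out, here is a minimal sketch of that flow. It is a toy example I made up for illustration, not part of the AO bench port below; the kernel, names, and sizes are arbitrary:

#include <cuda_runtime.h>

// toy kernel: each thread doubles one element in parallel
__global__ void scale2(float* data, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;   // global thread index
    if (i < n) data[i] *= 2.0f;
}

int main() {
    const int N = 1024;
    float h_buf[N];
    for (int i = 0; i < N; ++i) h_buf[i] = (float)i;

    float* d_buf;
    cudaMalloc((void**)&d_buf, sizeof(h_buf));                        // allocate GPU memory
    cudaMemcpy(d_buf, h_buf, sizeof(h_buf), cudaMemcpyHostToDevice);  // host -> device

    scale2<<<(N + 255) / 256, 256>>>(d_buf, N);                       // launch: <<<grid, block>>>

    cudaMemcpy(h_buf, d_buf, sizeof(h_buf), cudaMemcpyDeviceToHost);  // device -> host
    cudaFree(d_buf);
    return 0;
}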

So, the result: 0.320 seconds.

  • The C version took 1.756 seconds in the same environment, so this is a 5.49x speedup over a single CPU thread
    • The measurement includes process startup, saving the image, and so on, so the precision is rough (a kernel-only timing sketch follows this list)
  • NVIDIA Quadro FX 570
    • Intel(R) Core(TM)2 Duo CPU E8400 @ 3.00GHz, 2.99 GHz, 2.00 GB RAM
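Since the number above is wall-clock time for the whole process, a kernel-only measurement would need CUDA events. Here is a sketch of what I mean, to be placed around the render call in main() below; I did not actually run this, and the variable names are simply the ones used there:

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);
render<<<dimGrid, dimBlock>>>(d_fimg, d_rands);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);               // wait for the kernel to finish

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);   // elapsed time in milliseconds
printf("kernel: %f ms\n", ms);

cudaEventDestroy(start);
cudaEventDestroy(stop);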

Source below:

// includes, system
#include <stdio.h>

// includes, project
#include <cutil_inline.h>
#include <cuda.h>

#include "vec.h"

const int WIDTH = 256;
const int HEIGHT = 256;
const int NSUBSAMPLES = 2;
const int NAO_SAMPLES = 8;

__device__ const float M_PI = 3.1415926535;
__device__ const float NoHit = 1e16;

typedef struct {
    vec center;
    float radius;
} Sphere;

typedef struct {
    float a, b, c, d;
} Plane;

typedef struct {
    float t;
    vec p;
    vec n;
} Isect;


__device__ __host__ float sq(float x) { return x * x; }

__device__ __host__ void ray_sphere_intersect(Isect* isect, const vec* orig, const vec* dir, const vec* sphere_center, float sphere_radius) {
    vec v;
    vsub(&v, orig, sphere_center);
    float dd = sq(vdot(&v, dir)) - (vsqlength(&v) - sq(sphere_radius));
    if (dd >= 0) {
        float sqdd = sqrt(dd);
        float t1 = -vdot(&v, dir) - sqdd;
        if (0 < t1 && t1 < isect->t) {
            isect->t = t1;

            vec tmp;
            vadd(&isect->p, orig, vscale(&tmp, dir, t1));

            vnormalize(vsub(&isect->n, &isect->p, sphere_center));
        }
    }
}

__device__ __host__ void ray_plane_intersect(Isect* isect, const vec* orig, const vec* dir, float a, float b, float c, float d) {
    float h = orig->x * a + orig->y * b + orig->z * c + d;
    if (h >= 0) {
        float dot = dir->x * a + dir->y * b + dir->z * c;
        if (dot < 0) {
            float t = h / -dot;
            if (t < isect->t) {
                isect->t = t;

                vec tmp;
                vadd(&isect->p, orig, vscale(&tmp, dir, t));

                isect->n.x = a;
                isect->n.y = b;
                isect->n.z = c;
            }
        }
    }
}

__device__ __host__ void orthoBasis(vec* basis, const vec& n) {
    basis[2] = n;
    basis[1].x = basis[1].y = basis[1].z = 0;

    if (n.x < 0.6 && n.x > -0.6) {
        basis[1].x = 1;
    } else if (n.y < 0.6 && n.y > -0.6) {
        basis[1].y = 1;
    } else if (n.z < 0.6 && n.z > -0.6) {
        basis[1].z = 1;
    } else {
        basis[1].x = 1;
    }

    vnormalize(vcross(&basis[0], &basis[1], &basis[2]));
    vnormalize(vcross(&basis[1], &basis[2], &basis[0]));
}


__device__ const Sphere spheres[3] = {
    { { -2.0, 0.0, -3.5 }, 0.5 },
    { { -0.5, 0.0, -3.0 }, 0.5 },
    { { 1.0, 0.0, -2.2 }, 0.5 },
};

__device__ const Plane plane = { 0, 1, 0, 0.5 };


__device__ __host__ void ambient_occlusion(vec* col, const Isect* isect, float* rands) {
    const float eps = 0.0001;
    const int ntheta = NAO_SAMPLES;
    const int nphi = NAO_SAMPLES;

    vec p;
    vadd(&p, &isect->p, vscale(&p, &isect->n, eps));

    vec basis[3];
    orthoBasis(basis, isect->n);

    int occludeCount = 0;
    for (int i=0; i < ntheta * nphi; ++i) {
        // cosine-weighted hemisphere sample from the precomputed random table
        float theta = sqrt(rands[i*2+0]);
        float phi = 2 * M_PI * rands[i*2+1];

        float x = cos(phi) * theta;
        float y = sin(phi) * theta;
        float z = sqrt(1 - theta * theta);

        // local -> global
        float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x;
        float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y;
        float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z;

        vec raydir = { rx, ry, rz };
        Isect occIsect;
        occIsect.t = NoHit;
        ray_sphere_intersect(&occIsect, &p, &raydir, &spheres[0].center, spheres[0].radius);
        ray_sphere_intersect(&occIsect, &p, &raydir, &spheres[1].center, spheres[1].radius);
        ray_sphere_intersect(&occIsect, &p, &raydir, &spheres[2].center, spheres[2].radius);
        ray_plane_intersect(&occIsect, &p, &raydir, plane.a, plane.b, plane.c, plane.d);
        if (occIsect.t < NoHit) occludeCount += 1;
    }

    float c = (float)(ntheta * nphi - occludeCount) / (ntheta * nphi);
    col->x = c;
    col->y = c;
    col->z = c;
}

// kernel
__global__ void render(float* fimg, float* rands) {
    // one thread per pixel
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < WIDTH && y < HEIGHT) {
        vec col = { 0, 0, 0 };
        for (int v=0; v<NSUBSAMPLES; ++v) {
            for (int u=0; u<NSUBSAMPLES; ++u) {
                vec raydir;
                raydir.x = (x + (u / (float)NSUBSAMPLES) - (WIDTH / 2.0)) / (WIDTH / 2.0);
                raydir.y = -(y + (v / (float)NSUBSAMPLES) - (HEIGHT / 2.0)) / (HEIGHT / 2.0);
                raydir.z = -1;
                vnormalize(&raydir);

                const vec eyepos = { 0, 0, 0 };

                Isect isect;
                isect.t = NoHit;
                ray_sphere_intersect(&isect, &eyepos, &raydir, &spheres[0].center, spheres[0].radius);
                ray_sphere_intersect(&isect, &eyepos, &raydir, &spheres[1].center, spheres[1].radius);
                ray_sphere_intersect(&isect, &eyepos, &raydir, &spheres[2].center, spheres[2].radius);
                ray_plane_intersect(&isect, &eyepos, &raydir, plane.a, plane.b, plane.c, plane.d);
                if (isect.t < NoHit) {
                    vec c;
                    ambient_occlusion(&c, &isect, rands);
                    vadd(&col, &col, &c);
                }
            }
        }

        fimg[(y*WIDTH+x)*3+0] = col.x * (1.0 / sq(NSUBSAMPLES));
        fimg[(y*WIDTH+x)*3+1] = col.y * (1.0 / sq(NSUBSAMPLES));
        fimg[(y*WIDTH+x)*3+2] = col.z * (1.0 / sq(NSUBSAMPLES));
    }
}

float clamp(float x, float min, float max) {
    return x < min ? min : x > max ? max : x;
}

bool saveppm(const char* fn, int w, int h, const unsigned char* img) {
    FILE* fp = fopen(fn, "wb");
    if (fp == NULL) {
        return false;
    } else {
        fprintf(fp, "P6\n%d %d\n%d\n", w, h, 255);
        fwrite(img, w * h * 3, 1, fp);
        fclose(fp);
        return true;
    }
}

int main() {
    float* d_fimg;
    float* d_rands;

    // allocate device memory
    cutilSafeCall(cudaMalloc((void**) &d_fimg, sizeof(float) * (WIDTH * HEIGHT) * 3));
    cutilSafeCall(cudaMalloc((void**) &d_rands, sizeof(float) * NAO_SAMPLES * NAO_SAMPLES * 2));

    { // random-number table
        float h_rands[NAO_SAMPLES * NAO_SAMPLES * 2];
        for (int i=0; i<NAO_SAMPLES * NAO_SAMPLES * 2; ++i) h_rands[i] = (float)rand() / (float)RAND_MAX;
        // copy host memory to device
        cutilSafeCall( cudaMemcpy( d_rands, h_rands, sizeof(h_rands), cudaMemcpyHostToDevice) );
    }

    dim3 dimBlock(16, 16);
    dim3 dimGrid( (WIDTH + dimBlock.x - 1) / dimBlock.x, (HEIGHT + dimBlock.y - 1) / dimBlock.y);
    // execute the kernel
    render<<<dimGrid, dimBlock>>>(d_fimg, d_rands);

    // check if kernel execution generated an error
    cutilCheckMsg("Kernel execution failed");

    // fetch the result back to the host
    {
        const int N = WIDTH * HEIGHT * 3;
        float* result = new float[N];
        // copy result from device to host
        cutilSafeCall( cudaMemcpy( result, d_fimg, sizeof(float) * N, cudaMemcpyDeviceToHost) );

        unsigned char* img = new unsigned char[N];
        for (int i=0; i<N; ++i) img[i] = (unsigned char)(clamp(result[i], 0, 1) * 255.99f);
        saveppm("ao.ppm", WIDTH, HEIGHT, img);

        delete[] result;
        delete[] img;
    }

    // cleanup memory
    cutilSafeCall(cudaFree(d_fimg));
    cutilSafeCall(cudaFree(d_rands));
}

vec.h

typedef float4 vec;

/* return vector sum c = a+b */
__device__ __host__ vec* vadd(vec* dst, const vec* v1, const vec* v2) {
    dst->x = v1->x + v2->x;
    dst->y = v1->y + v2->y;
    dst->z = v1->z + v2->z;
    return dst;
}

/* return vector difference c = a-b */
__device__ __host__ vec* vsub(vec* dst, const vec* v1, const vec* v2) {
    dst->x = v1->x - v2->x;
    dst->y = v1->y - v2->y;
    dst->z = v1->z - v2->z;
    return dst;
}

/* return vector scale c = a*s */
__device__ __host__ vec* vscale(vec* dst, const vec* v, float s) {
    dst->x = v->x * s;
    dst->y = v->y * s;
    dst->z = v->z * s;
    return dst;
}

/* return the dot product of vectors a and b */
__device__ __host__ float vdot(const vec* a, const vec* b) {
    return a->x * b->x + a->y * b->y + a->z * b->z;
}

/* return the cross product of vectors a and b */
__device__ __host__ vec* vcross(vec* dst, const vec* a, const vec* b) {
    dst->x = a->y * b->z - a->z * b->y;
    dst->y = a->z * b->x - a->x * b->z;
    dst->z = a->x * b->y - a->y * b->x;
    return dst;
}

/* returns squared length of input vector */
__device__ __host__ float vsqlength(const vec* v) {
    return v->x * v->x + v->y * v->y + v->z * v->z;
}

/* returns length of input vector */
__device__ __host__ float vlength(const vec* v) {
    return sqrt(vsqlength(v));
}

/* normalizes the input vector and returns it */
__device__ __host__ vec* vnormalize(vec* v) {
    float len = vlength(v);
    if (len != 0.0) {
        v->x /= len;
        v->y /= len;
        v->z /= len;
    }
    return v;
}
  • It is almost the C version unchanged
    • Give the file a .cu extension and you can write C functions and CUDA functions in one big jumble. How does that even work?
    • About all I did was prepend __device__ __host__ to the function definitions
  • I couldn't figure out how to generate random numbers on the GPU, so I went with a fixed random-number table, and the result looks disappointing (a per-pixel table sketch follows this list)
  • At 3 fps you can legitimately call it real time, right?
    • GPUAO reportedly does 1024x768 in about 10 ms. That's fast.
  • I'm a little worried that literals written as 0.0 are quietly becoming double precision...
  • Somehow, having to decide what to parallelize, design the data structures, transfer them over, and copy the results back is a hassle. Something like OpenMP or parallel_for, where you just write an annotation and it gets faster, would be much nicer.
  • I have no real idea how this is actually executed on the GPU
    • I'd like to be able to poke at it at a lower level
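About the random-number table: one untested idea, sketched here as an assumption rather than something I actually tried, would be to generate a separate table per pixel on the host and have each thread read only its own slice. The host side would become something like this, and ambient_occlusion would be passed a per-pixel offset into d_rands:

// host side: WIDTH*HEIGHT independent tables instead of one shared table
const int RANDS_PER_PIXEL = NAO_SAMPLES * NAO_SAMPLES * 2;
const int N_RANDS = WIDTH * HEIGHT * RANDS_PER_PIXEL;   // 256*256*128 floats, about 32 MB
float* h_rands = new float[N_RANDS];
for (int i = 0; i < N_RANDS; ++i) h_rands[i] = (float)rand() / (float)RAND_MAX;
cutilSafeCall(cudaMalloc((void**)&d_rands, sizeof(float) * N_RANDS));
cutilSafeCall(cudaMemcpy(d_rands, h_rands, sizeof(float) * N_RANDS, cudaMemcpyHostToDevice));
delete[] h_rands;

// kernel side: pass the per-pixel slice instead of the shared table
// ambient_occlusion(&c, &isect, rands + (y * WIDTH + x) * RANDS_PER_PIXEL);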

References