AO bench in CUDA

2009-09-11

There's no CUDA version of AO bench! So I decided to try writing one.

Apparently you allocate GPU memory with cudaMalloc, copy memory with cudaMemcpy, and invoke the GPU program with func<<<grid, block>>>(). On the GPU side, each thread uses threadIdx and blockIdx to work on its own piece of the data in parallel. For vector types, float4 is available.
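Just to spell the pattern out, here is a minimal sketch of that flow. It is a toy example I made up for illustration, not part of the AO bench port below; the kernel, names, and sizes are arbitrary:

#include <cuda_runtime.h>

// toy kernel: each thread doubles one element in parallel
__global__ void scale2(float* data, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;   // global thread index
    if (i < n) data[i] *= 2.0f;
}

int main() {
    const int N = 1024;
    float h_buf[N];
    for (int i = 0; i < N; ++i) h_buf[i] = (float)i;

    float* d_buf;
    cudaMalloc((void**)&d_buf, sizeof(h_buf));                        // allocate GPU memory
    cudaMemcpy(d_buf, h_buf, sizeof(h_buf), cudaMemcpyHostToDevice);  // host -> device

    scale2<<<(N + 255) / 256, 256>>>(d_buf, N);                       // launch: <<<grid, block>>>

    cudaMemcpy(h_buf, d_buf, sizeof(h_buf), cudaMemcpyDeviceToHost);  // device -> host
    cudaFree(d_buf);
    return 0;
}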

So, the result: 0.320 seconds.

  • The C version took 1.756 seconds in the same environment, so this is a 5.49x speedup over a single CPU thread
    • The measurement includes process startup, saving the image, and so on, so the precision is rough (a kernel-only timing sketch follows this list)
  • NVIDIA Quadro FX 570
    • Intel(R) Core(TM)2 Duo CPU E8400 @ 3.00GHz, 2.99 GHz, 2.00 GB RAM
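Since the number above is wall-clock time for the whole process, a kernel-only measurement would need CUDA events. Here is a sketch of what I mean, to be placed around the render call in main() below; I did not actually run this, and the variable names are simply the ones used there:

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);
render<<<dimGrid, dimBlock>>>(d_fimg, d_rands);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);               // wait for the kernel to finish

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);   // elapsed time in milliseconds
printf("kernel: %f ms\n", ms);

cudaEventDestroy(start);
cudaEventDestroy(stop);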

Source below:

// includes, system
#include <stdio.h>

// includes, project
#include <cutil_inline.h>
#include <cuda.h>

#include "vec.h"

const int WIDTH = 256;
const int HEIGHT = 256;
const int NSUBSAMPLES = 2;
const int NAO_SAMPLES = 8;

__device__ const float M_PI = 3.1415926535;
__device__ const float NoHit = 1e16;

typedef struct {
    vec center;
    float radius;
} Sphere;

typedef struct {
    float a, b, c, d;
} Plane;

typedef struct {
    float t;
    vec p;
    vec n;
} Isect;


__device__ __host__ float sq(float x) { return x * x; }

__device__ __host__ void ray_sphere_intersect(Isect* isect, const vec* orig, const vec* dir, const vec* sphere_center, float sphere_radius) {
    vec v;
    vsub(&v, orig, sphere_center);
    float dd = sq(vdot(&v, dir)) - (vsqlength(&v) - sq(sphere_radius));
    if (dd >= 0) {
        float sqdd = sqrt(dd);
        float t1 = -vdot(&v, dir) - sqdd;
        if (0 < t1 && t1 < isect->t) {
            isect->t = t1;

            vec tmp;
            vadd(&isect->p, orig, vscale(&tmp, dir, t1));

            vnormalize(vsub(&isect->n, &isect->p, sphere_center));
        }
    }
}

__device__ __host__ void ray_plane_intersect(Isect* isect, const vec* orig, const vec* dir, float a, float b, float c, float d) {
    float h = orig->x * a + orig->y * b + orig->z * c + d;
    if (h >= 0) {
        float dot = dir->x * a + dir->y * b + dir->z * c;
        if (dot < 0) {
            float t = h / -dot;
            if (t < isect->t) {
                isect->t = t;

                vec tmp;
                vadd(&isect->p, orig, vscale(&tmp, dir, t));

                isect->n.x = a;
                isect->n.y = b;
                isect->n.z = c;
            }
        }
    }
}

__device__ __host__ void orthoBasis(vec* basis, const vec& n) {
    basis[2] = n;
    basis[1].x = basis[1].y = basis[1].z = 0;

    if (n.x < 0.6 && n.x > -0.6) {
        basis[1].x = 1;
    } else if (n.y < 0.6 && n.y > -0.6) {
        basis[1].y = 1;
    } else if (n.z < 0.6 && n.z > -0.6) {
        basis[1].z = 1;
    } else {
        basis[1].x = 1;
    }

    vnormalize(vcross(&basis[0], &basis[1], &basis[2]));
    vnormalize(vcross(&basis[1], &basis[2], &basis[0]));
}


__device__ const Sphere spheres[3] = {
    { { -2.0, 0.0, -3.5 }, 0.5 },
    { { -0.5, 0.0, -3.0 }, 0.5 },
    { { 1.0, 0.0, -2.2 }, 0.5 },
};

__device__ const Plane plane = { 0, 1, 0, 0.5 };


__device__ __host__ void ambient_occlusion(vec* col, const Isect* isect, float* rands) {
    const float eps = 0.0001;
    const int ntheta = NAO_SAMPLES;
    const int nphi = NAO_SAMPLES;

    vec p;
    vadd(&p, &isect->p, vscale(&p, &isect->n, eps));

    vec basis[3];
    orthoBasis(basis, isect->n);

    int occludeCount = 0;
    for (int i=0; i < ntheta * nphi; ++i) {
        // cosine-weighted hemisphere sample from the precomputed random table
        float theta = sqrt(rands[i*2+0]);
        float phi = 2 * M_PI * rands[i*2+1];

        float x = cos(phi) * theta;
        float y = sin(phi) * theta;
        float z = sqrt(1 - theta * theta);

        // local -> global
        float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x;
        float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y;
        float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z;

        vec raydir = { rx, ry, rz };
        Isect occIsect;
        occIsect.t = NoHit;
        ray_sphere_intersect(&occIsect, &p, &raydir, &spheres[0].center, spheres[0].radius);
        ray_sphere_intersect(&occIsect, &p, &raydir, &spheres[1].center, spheres[1].radius);
        ray_sphere_intersect(&occIsect, &p, &raydir, &spheres[2].center, spheres[2].radius);
        ray_plane_intersect(&occIsect, &p, &raydir, plane.a, plane.b, plane.c, plane.d);
        if (occIsect.t < NoHit) occludeCount += 1;
    }

    float c = (float)(ntheta * nphi - occludeCount) / (ntheta * nphi);
    col->x = c;
    col->y = c;
    col->z = c;
}

// kernel
__global__ void render(float* fimg, float* rands) {
    // one thread per pixel
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < WIDTH && y < HEIGHT) {
        vec col = { 0, 0, 0 };
        for (int v=0; v<NSUBSAMPLES; ++v) {
            for (int u=0; u<NSUBSAMPLES; ++u) {
                vec raydir;
                raydir.x = (x + (u / (float)NSUBSAMPLES) - (WIDTH / 2.0)) / (WIDTH / 2.0);
                raydir.y = -(y + (v / (float)NSUBSAMPLES) - (HEIGHT / 2.0)) / (HEIGHT / 2.0);
                raydir.z = -1;
                vnormalize(&raydir);

                const vec eyepos = { 0, 0, 0 };

                Isect isect;
                isect.t = NoHit;
                ray_sphere_intersect(&isect, &eyepos, &raydir, &spheres[0].center, spheres[0].radius);
                ray_sphere_intersect(&isect, &eyepos, &raydir, &spheres[1].center, spheres[1].radius);
                ray_sphere_intersect(&isect, &eyepos, &raydir, &spheres[2].center, spheres[2].radius);
                ray_plane_intersect(&isect, &eyepos, &raydir, plane.a, plane.b, plane.c, plane.d);
                if (isect.t < NoHit) {
                    vec c;
                    ambient_occlusion(&c, &isect, rands);
                    vadd(&col, &col, &c);
                }
            }
        }

        fimg[(y*WIDTH+x)*3+0] = col.x * (1.0 / sq(NSUBSAMPLES));
        fimg[(y*WIDTH+x)*3+1] = col.y * (1.0 / sq(NSUBSAMPLES));
        fimg[(y*WIDTH+x)*3+2] = col.z * (1.0 / sq(NSUBSAMPLES));
    }
}

float clamp(float x, float min, float max) {
    return x < min ? min : x > max ? max : x;
}

bool saveppm(const char* fn, int w, int h, const unsigned char* img) {
    FILE* fp = fopen(fn, "wb");
    if (fp == NULL) {
        return false;
    } else {
        fprintf(fp, "P6\n%d %d\n%d\n", w, h, 255);
        fwrite(img, w * h * 3, 1, fp);
        fclose(fp);
        return true;
    }
}

int main() {
    float* d_fimg;
    float* d_rands;

    // allocate device memory
    cutilSafeCall(cudaMalloc((void**) &d_fimg, sizeof(float) * (WIDTH * HEIGHT) * 3));
    cutilSafeCall(cudaMalloc((void**) &d_rands, sizeof(float) * NAO_SAMPLES * NAO_SAMPLES * 2));

    { // random-number table
        float h_rands[NAO_SAMPLES * NAO_SAMPLES * 2];
        for (int i=0; i<NAO_SAMPLES * NAO_SAMPLES * 2; ++i) h_rands[i] = (float)rand() / (float)RAND_MAX;
        // copy host memory to device
        cutilSafeCall( cudaMemcpy( d_rands, h_rands, sizeof(h_rands), cudaMemcpyHostToDevice) );
    }

    dim3 dimBlock(16, 16);
    dim3 dimGrid( (WIDTH + dimBlock.x - 1) / dimBlock.x, (HEIGHT + dimBlock.y - 1) / dimBlock.y);
    // execute the kernel
    render<<<dimGrid, dimBlock>>>(d_fimg, d_rands);

    // check if kernel execution generated an error
    cutilCheckMsg("Kernel execution failed");

    // fetch the result back to the host
    {
        const int N = WIDTH * HEIGHT * 3;
        float* result = new float[N];
        // copy result from device to host
        cutilSafeCall( cudaMemcpy( result, d_fimg, sizeof(float) * N, cudaMemcpyDeviceToHost) );

        unsigned char* img = new unsigned char[N];
        for (int i=0; i<N; ++i) img[i] = (unsigned char)(clamp(result[i], 0, 1) * 255.99f);
        saveppm("ao.ppm", WIDTH, HEIGHT, img);

        delete[] result;
        delete[] img;
    }

    // cleanup memory
    cutilSafeCall(cudaFree(d_fimg));
    cutilSafeCall(cudaFree(d_rands));
}

vec.h

typedef float4 vec;

/* return vector sum c = a+b */
__device__ __host__ vec* vadd(vec* dst, const vec* v1, const vec* v2) {
    dst->x = v1->x + v2->x;
    dst->y = v1->y + v2->y;
    dst->z = v1->z + v2->z;
    return dst;
}

/* return vector difference c = a-b */
__device__ __host__ vec* vsub(vec* dst, const vec* v1, const vec* v2) {
    dst->x = v1->x - v2->x;
    dst->y = v1->y - v2->y;
    dst->z = v1->z - v2->z;
    return dst;
}

/* return vector scale c = a*s */
__device__ __host__ vec* vscale(vec* dst, const vec* v, float s) {
    dst->x = v->x * s;
    dst->y = v->y * s;
    dst->z = v->z * s;
    return dst;
}

/* return the dot product of vectors a and b */
__device__ __host__ float vdot(const vec* a, const vec* b) {
    return a->x * b->x + a->y * b->y + a->z * b->z;
}

/* return the cross product of vectors a and b */
__device__ __host__ vec* vcross(vec* dst, const vec* a, const vec* b) {
    dst->x = a->y * b->z - a->z * b->y;
    dst->y = a->z * b->x - a->x * b->z;
    dst->z = a->x * b->y - a->y * b->x;
    return dst;
}

/* returns squared length of input vector */
__device__ __host__ float vsqlength(const vec* v) {
    return v->x * v->x + v->y * v->y + v->z * v->z;
}

/* returns length of input vector */
__device__ __host__ float vlength(const vec* v) {
    return sqrt(vsqlength(v));
}

/* normalizes the input vector and returns it */
__device__ __host__ vec* vnormalize(vec* v) {
    float len = vlength(v);
    if (len != 0.0) {
        v->x /= len;
        v->y /= len;
        v->z /= len;
    }
    return v;
}
  • It is almost the C version unchanged
    • Give the file a .cu extension and you can write C functions and CUDA functions in one big jumble. How does that even work?
    • About all I did was prepend __device__ __host__ to the function definitions
  • I couldn't figure out how to generate random numbers on the GPU, so I went with a fixed random-number table, and the result looks disappointing (a per-pixel table sketch follows this list)
  • At 3 fps you can legitimately call it real time, right?
    • GPUAO reportedly does 1024x768 in about 10 ms. That's fast.
  • I'm a little worried that literals written as 0.0 are quietly becoming double precision...
  • Somehow, having to decide what to parallelize, design the data structures, transfer them over, and copy the results back is a hassle. Something like OpenMP or parallel_for, where you just write an annotation and it gets faster, would be much nicer.
  • I have no real idea how this is actually executed on the GPU
    • I'd like to be able to poke at it at a lower level
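About the random-number table: one untested idea, sketched here as an assumption rather than something I actually tried, would be to generate a separate table per pixel on the host and have each thread read only its own slice. The host side would become something like this, and ambient_occlusion would be passed a per-pixel offset into d_rands:

// host side: WIDTH*HEIGHT independent tables instead of one shared table
const int RANDS_PER_PIXEL = NAO_SAMPLES * NAO_SAMPLES * 2;
const int N_RANDS = WIDTH * HEIGHT * RANDS_PER_PIXEL;   // 256*256*128 floats, about 32 MB
float* h_rands = new float[N_RANDS];
for (int i = 0; i < N_RANDS; ++i) h_rands[i] = (float)rand() / (float)RAND_MAX;
cutilSafeCall(cudaMalloc((void**)&d_rands, sizeof(float) * N_RANDS));
cutilSafeCall(cudaMemcpy(d_rands, h_rands, sizeof(float) * N_RANDS, cudaMemcpyHostToDevice));
delete[] h_rands;

// kernel side: pass the per-pixel slice instead of the shared table
// ambient_occlusion(&c, &isect, rands + (y * WIDTH + x) * RANDS_PER_PIXEL);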

References