| #include <stdio.h>
 
 
 #include <cutil_inline.h>
 #include <cuda.h>
 
 #include "vec.h"
 
 const int WIDTH  = 256;
 const int HEIGHT = 256;
 const int NSUBSAMPLES = 2;
 const int NAO_SAMPLES = 8;
 
 __device__ const float M_PI = 3.1415926535;
 __device__ const float NoHit = 1e16;
 
 typedef struct {
 vec center;
 float radius;
 } Sphere;
 
 typedef struct {
 float a, b, c, d;
 } Plane;
 
 typedef struct {
 float t;
 vec p;
 vec n;
 } Isect;
 
 
 __device__ __host__ float sq(float x) { return x * x; }
 
 __device__ __host__ void ray_sphere_intersect(Isect* isect, const vec* orig, const vec* dir, const vec* sphere_center, float sphere_radius) {
 vec v;
 vsub(&v, orig, sphere_center);
 float dd = sq(vdot(&v, dir)) - (vsqlength(&v) - sq(sphere_radius));
 if (dd >= 0) {
 float sqdd = sqrt(dd);
 float t1 = -vdot(&v, dir) - sqdd;
 if (0 < t1 && t1 < isect->t) {
 isect->t = t1;
 
 vec tmp;
 vadd(&isect->p, orig, vscale(&tmp, dir, t1));
 
 vnormalize(vsub(&isect->n, &isect->p, sphere_center));
 }
 }
 }
 
 __device__ __host__ void ray_plane_intersect(Isect* isect, const vec* orig, const vec* dir, float a, float b, float c, float d) {
 float h = orig->x * a + orig->y * b + orig->z * c + d;
 if (h >= 0) {
 float dot = dir->x * a + dir->y * b + dir->z * c;
 if (dot < 0) {
 float t = h / -dot;
 if (t < isect->t) {
 isect->t = t;
 
 vec tmp;
 vadd(&isect->p, orig, vscale(&tmp, dir, t));
 
 isect->n.x = a;
 isect->n.y = b;
 isect->n.z = c;
 }
 }
 }
 }
 
 __device__ __host__ void orthoBasis(vec* basis, const vec& n) {
 basis[2] = n;
 basis[1].x = basis[1].y = basis[1].z = 0;
 
 if (n.x < 0.6 && n.x > -0.6) {
 basis[1].x = 1;
 } else if (n.y < 0.6 && n.y > -0.6) {
 basis[1].y = 1;
 } else if (n.z < 0.6 && n.z > -0.6) {
 basis[1].z = 1;
 } else {
 basis[1].x = 1;
 }
 
 vnormalize(vcross(&basis[0], &basis[1], &basis[2]));
 vnormalize(vcross(&basis[1], &basis[2], &basis[0]));
 }
 
 
 __device__ const Sphere spheres[3] = {
 {    { -2.0, 0.0, -3.5 }, 0.5    },
 {    { -0.5, 0.0, -3.0 }, 0.5    },
 {    {  1.0, 0.0, -2.2 }, 0.5    },
 };
 
 __device__ const Plane plane = { 0, 1, 0, 0.5 };
 
 
 __device__ __host__ void ambient_occlusion(vec* col, const Isect* isect, float* rands) {
 const float eps = 0.0001;
 const int ntheta = NAO_SAMPLES;
 const int nphi   = NAO_SAMPLES;
 
 vec p;
 vadd(&p, &isect->p, vscale(&p, &isect->n, eps));
 
 vec basis[3];
 orthoBasis(basis, isect->n);
 
 int occludeCount = 0;
 for (int i=0; i < ntheta * nphi; ++i) {
 float theta = sqrt(rands[i*2+0]);
 float phi   = 2 * M_PI * rands[i*2+1];
 
 float x = cos(phi) * theta;
 float y = sin(phi) * theta;
 float z = sqrt(1 - theta * theta);
 
 
 float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x;
 float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y;
 float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z;
 
 vec raydir = { rx, ry, rz };
 Isect occIsect;
 occIsect.t = NoHit;
 ray_sphere_intersect(&occIsect, &p, &raydir, &spheres[0].center, spheres[0].radius);
 ray_sphere_intersect(&occIsect, &p, &raydir, &spheres[1].center, spheres[1].radius);
 ray_sphere_intersect(&occIsect, &p, &raydir, &spheres[2].center, spheres[2].radius);
 ray_plane_intersect(&occIsect, &p, &raydir, plane.a, plane.b, plane.c, plane.d);
 if (occIsect.t < NoHit)    occludeCount += 1;
 }
 
 float c = (float)(ntheta * nphi - occludeCount) / (ntheta * nphi);
 col->x = c;
 col->y = c;
 col->z = c;
 }
 
 
 __global__ void render(float* fimg, float* rands) {
 int x = blockIdx.x * blockDim.x + threadIdx.x;
 int y = blockIdx.y * blockDim.y + threadIdx.y;
 if (x < WIDTH && y < HEIGHT) {
 vec col = { 0, 0, 0 };
 for (int v=0; v<NSUBSAMPLES; ++v) {
 for (int u=0; u<NSUBSAMPLES; ++u) {
 vec raydir;
 raydir.x = (x + (u / (float)NSUBSAMPLES) - (WIDTH / 2.0)) / (WIDTH / 2.0);
 raydir.y = -(y + (v / (float)NSUBSAMPLES) - (HEIGHT / 2.0)) / (HEIGHT / 2.0);
 raydir.z = -1;
 vnormalize(&raydir);
 
 const vec eyepos = { 0, 0, 0 };
 
 Isect isect;
 isect.t = NoHit;
 ray_sphere_intersect(&isect, &eyepos, &raydir, &spheres[0].center, spheres[0].radius);
 ray_sphere_intersect(&isect, &eyepos, &raydir, &spheres[1].center, spheres[1].radius);
 ray_sphere_intersect(&isect, &eyepos, &raydir, &spheres[2].center, spheres[2].radius);
 ray_plane_intersect(&isect, &eyepos, &raydir, plane.a, plane.b, plane.c, plane.d);
 if (isect.t < NoHit) {
 vec c;
 ambient_occlusion(&c, &isect, rands);
 vadd(&col, &col, &c);
 }
 }
 }
 
 fimg[(y*WIDTH+x)*3+0] = col.x * (1.0 / sq(NSUBSAMPLES));
 fimg[(y*WIDTH+x)*3+1] = col.y * (1.0 / sq(NSUBSAMPLES));
 fimg[(y*WIDTH+x)*3+2] = col.z * (1.0 / sq(NSUBSAMPLES));
 }
 }
 
 float clamp(float x, float min, float max) {
 return x < min ? min : x > max ? max : x;
 }
 
 bool saveppm(const char* fn, int w, int h, const unsigned char* img) {
 FILE* fp = fopen(fn, "wb");
 if (fp == NULL) {
 return false;
 } else {
 fprintf(fp,"P6\n%d %d\n%d\n", w, h, 255);
 fwrite(img, w * h * 3, 1, fp);
 return true;
 }
 }
 
 int main() {
 float* d_fimg;
 float* d_rands;
 
 
 cutilSafeCall(cudaMalloc((void**) &d_fimg, sizeof(float) * (WIDTH * HEIGHT) * 3));
 cutilSafeCall(cudaMalloc((void**) &d_rands, sizeof(float) * NAO_SAMPLES * NAO_SAMPLES * 2));
 
 {
 float h_rands[NAO_SAMPLES * NAO_SAMPLES * 2];
 for (int i=0; i<NAO_SAMPLES * NAO_SAMPLES * 2; ++i)    h_rands[i] = (float)rand() / (float)RAND_MAX;
 
 cutilSafeCall( cudaMemcpy( d_rands, h_rands, sizeof(h_rands), cudaMemcpyHostToDevice) );
 }
 
 dim3 dimBlock(16, 16);
 dim3 dimGrid( (WIDTH  + dimBlock.x - 1) / dimBlock.x, (HEIGHT + dimBlock.y - 1) / dimBlock.y);
 
 render<<<dimGrid, dimBlock>>>(d_fimg, d_rands);
 
 
 cutilCheckMsg("Kernel execution failed");
 
 
 {
 const int N = WIDTH * HEIGHT * 3;
 float* result = new float[N];
 
 cutilSafeCall( cudaMemcpy( result, d_fimg, sizeof(float) * N, cudaMemcpyDeviceToHost) );
 
 unsigned char* img = new unsigned char[N];
 for (int i=0; i<N; ++i)    img[i] = (unsigned char)(clamp(result[i], 0, 1) * 255.99f);
 saveppm("ao.ppm", WIDTH, HEIGHT, img);
 
 delete[] result;
 delete[] img;
 }
 
 
 cutilSafeCall(cudaFree(d_fimg));
 cutilSafeCall(cudaFree(d_rands));
 }
 
 |