// GLSL Sphere raytracing int refl; for( refl = 0; refl < REFLECTIONS; refl++ ) { float rt_dist = 0.0f; int rt_i = -1; vec3 rt_t0; vec4 v0, v1, v2, v3; v0 = vec4( ray_pos - obj_pos[0].xyz, obj_pos[0].w ); v1 = vec4( ray_pos - obj_pos[1].xyz, obj_pos[1].w ); v2 = vec4( ray_pos - obj_pos[2].xyz, obj_pos[2].w ); v3 = vec4( ray_pos - obj_pos[3].xyz, obj_pos[3].w ); for( int i = 0; i < OBJNUM; i += 32 ) { vec4 v4, v5, v6, v7; vec4 b0, d0; v4 = vec4( ray_pos - obj_pos[i+4].xyz, obj_pos[i+4].w ); v5 = vec4( ray_pos - obj_pos[i+5].xyz, obj_pos[i+5].w ); v6 = vec4( ray_pos - obj_pos[i+6].xyz, obj_pos[i+6].w ); v7 = vec4( ray_pos - obj_pos[i+7].xyz, obj_pos[i+7].w ); b0 = vec4( dot( v0.xyz, ray_dir ), dot( v1.xyz, ray_dir ), dot( v2.xyz, ray_dir ), dot( v3.xyz, ray_dir ) ); d0 = vec4( b0.x * b0.x - dot( v0.xyz, v0.xyz ) + v0.w, b0.y * b0.y - dot( v1.xyz, v1.xyz ) + v1.w, b0.z * b0.z - dot( v2.xyz, v2.xyz ) + v2.w, b0.w * b0.w - dot( v3.xyz, v3.xyz ) + v3.w ); if( any( greaterThan( d0, vec4( 0.0f ) ) ) ) { vec4 t0 = 1.0f / (-b0 - sqrt( d0 )); float mv = max( max( t0.x, t0.y ), max( t0.z, t0.w ) ); if( mv > rt_dist ) { rt_dist = mv; rt_t0 = t0.yzw; rt_i = i + 0; } } // ... ditto for the rest ... // inverted rd_dist; "close enough" cutoff if( rt_dist > 0.33f ) break; } if( rt_i == -1 ) break; ivec3 iv = ivec3( equal( rt_t0, vec3( rt_dist ) ) ); rt_i += iv.x + iv.y + iv.y + iv.z + iv.z + iv.z; ray_pos += ray_dir * (1.0f / rt_dist); vec3 n = normalize( ray_pos - obj_pos[rt_i].xyz ); vec3 l = normalize( light_pos.xyz - ray_pos ); float diffuse = max( dot( n, l ), 0.0f ); float specular = pow( max( dot( ray_dir, l - n * 2.0f * diffuse ), 0.0f ), 8.0f ); col += vec3( specular ) + obj_col[rt_i].xyz * diffuse; ray_dir = reflect( ray_dir, n ); } // Standard noise calculation: float hash(float h) { return fract(sin(h) * 43758.5453123); } float noise(vec3 x) { vec3 p = floor(x); vec3 f = fract(x); f = f * f * (3.0 - 2.0 * f); float n = p.x + p.y * 157.0 + 113.0 * p.z; return mix( mix(mix(hash(n + 0.0), hash(n + 1.0), f.x), mix(hash(n + 157.0), hash(n + 158.0), f.x), f.y), mix(mix(hash(n + 113.0), hash(n + 114.0), f.x), mix(hash(n + 270.0), hash(n + 271.0), f.x), f.y), f.z); } // Super-fast noise calculation: // Precalculate this on the CPU: // Overlapping table: Reduces the entire noise calculation to two LDC.F32X4. for( int i = 0; i < TABLELEN; i++ ) { v4 v; v.x = fract( sinf( ((i + 0)&TABLEMASK) * 43758.5453123f ) ); v.y = fract( sinf( ((i + 157)&TABLEMASK) * 43758.5453123f ) ); v.z = fract( sinf( ((i + 113)&TABLEMASK) * 43758.5453123f ) ); v.w = fract( sinf( ((i + 270)&TABLEMASK) * 43758.5453123f ) ); gpu3d.noisetable[i] = v; } float noise( vec3 x00 ) { ivec3 p00 = ivec3( floor( x00 ) ); vec3 f00 = fract( x00 ); f00 = f00 * f00 * ( 3.0 - 2.0 * f00 ); uint n00 = uint( abs( p00.x + p00.y * 157 + 113 * p00.z ) ); vec4 h00 = noisetable[(n00+0u)&uint(TABLEMASK)]; vec4 h01 = noisetable[(n00+1u)&uint(TABLEMASK)]; h00 = mix( h00, h01, f00.x ); h00.xy = mix( h00.xz, h00.yw, f00.y ); h00.x = mix( h00.x, h00.y, f00.z ); return h00.x; } // Quaternion intersection calculation (in HLSL) float4 quatMult( float4 q1, float4 q2 ) { float4 r; r.x = q1.x*q2.x - dot( q1.yzw, q2.yzw ); r.yzw = q1.x*q2.yzw + q2.x*q1.yzw + cross( q1.yzw, q2.yzw ); return r; } float4 quatSq( float4 q ) { float4 r; r.x = q.x*q.x - dot( q.yzw, q.yzw ); r.yzw = 2*q.x*q.yzw; return r; } void iterateIntersect( inout float4 q, inout float4 qp, float4 c, int maxIterations ) { for( int i=0; i ESCAPE_THRESHOLD ) { break; } } } // Faster version (in GLSL) vec4 quatMult( vec4 q1, vec4 q2 ) { vec4 r; r.x = q1.x * q2.x - dot( q1.yzw, q2.yzw ); r.yzw = q1.x * q2.yzw + q2.x * q1.yzw + q1.zwy * q2.wyz - q1.wyz * q2.zwy; return r; } vec4 iterateIntersect( inout vec4 q1, vec4 c ) { bool r0, r1; vec4 q0; vec4 qp0, qp1; qp0 = 2.0f * q1; q0 = quatSq( q1 ) + c; qp1 = 2.0f * quatMult( q0, qp0 ); q1 = quatSq( q0 ) + c; r0 = dot( q0, q0 ) > ESCAPE_THRESHOLD; r1 = dot( q1, q1 ) > ESCAPE_THRESHOLD; if( r0 || r1 ) { q1 = r0 ? q0 : q1; return r0 ? qp0 : qp1; } for( int i = 0; i < ITERATIONS-2; i += 2 ) { qp0 = 2.0f * quatMult( q1, qp1 ); q0 = quatSq( q1 ) + c; qp1 = 2.0f * quatMult( q0, qp0 ); q1 = quatSq( q0 ) + c; r0 = dot( q0, q0 ) > ESCAPE_THRESHOLD; r1 = dot( q1, q1 ) > ESCAPE_THRESHOLD; if( r0 || r1 ) break; } q1 = r0 ? q0 : q1; return r0 ? qp0 : qp1; } // NVCC generated assembler code: MAD.F32 R0.z, R6, R0.w, R0.x; ADD.U R0.x, R0.y, {1, 0, 0, 0}; MUL.F32 R4.w, R0.z, {0.25, 0, 0, 0}.x; FRC.F32 R5.xyz, R5; MAD.F32 R6.xyz, -R5, {2, 3, 0, 0}.x, {2, 3, 0, 0}.y; AND.U R0.x, R0, {255, 0, 0, 0}; AND.U R0.y, R0, {255, 0, 0, 0}.x; MUL.S R0.x, R0, {16, 0, 0, 0}; MUL.S R0.y, R0, {16, 0, 0, 0}.x; MOV.U R0.y, R0; MUL.F32 R5.xyz, R5, R5; LDC.F32X4 R1, buf0[R0.y + 16]; MOV.U R0.x, R0; LDC.F32X4 R0, buf0[R0.x + 16]; MUL.F32 R5.xyz, R5, R6; ADD.F32 R0, R0, -R1; MAD.F32 R0, R5.x, R0, R1; ADD.F32 R1.xy, R0.ywzw, -R0.xzzw; MUL.F32 R0.w, R4.z, {0.25, 0, 0, 0}.x; MUL.F32 R0.y, R4.z, {0.180030003, 0, 0, 0}.x; COS.F32 R0.w, R0.w; SIN.F32 R0.y, R0.y; MUL.F32 R1.z, R0.y, {3.6230998, 0, 0, 0}.x; MAD.F32 R0.xy, R5.y, R1, R0.xzzw; ADD.F32 R0.y, R0, -R0.x; MUL.F32 R1.w, R0, {2.43239999, 0, 0, 0}.x; ADD.F32 R1.zw, R4.xyxy, -R1; DP2.F32 R0.z, R1.zwzw, R1.zwzw; ADD.F32 R4.xyz, R2, {-0.00100000005, -0, 0, 0}.xyyw; MAD.F32 R0.x, R5.z, R0.y, R0; MAD.F32 R3.w, R3, {0.5, 0, 0, 0}.x, R4; MAD.F32 R0.x, R0, {0.125, 0, 0, 0}, R3.w; ADD.F32 R0.x, R0, {-0.319000006, 0, 0, 0}; RSQ.F32 R0.z, R0.z; RCP.F32 R0.y, R0.z; ADD.F32 R0.y, R0, {-0.856600046, 0, 0, 0}.x; MAD.F32 R0.w, -R0.x, {2.99800014, 0, 0, 0}.x, -R0.y; MAD.F32.SAT R1.x, R0.w, {0.25, 0.5, 0, 0}, {0.25, 0.5, 0, 0}.y; MUL.F32 R1.y, R0.x, {2.99800014, 0, 0, 0}.x; MAD.F32 R3.w, R1.x, R0, R1.y; MUL.F32 R5.xyz, R4, {0.333555698, 0, 0, 0}.x; MUL.F32 R0.w, R1.x, {2, 0, 0, 0}.x; FLR.F R0.xyz, R5; TRUNC.S R0.xyz, R0; MAD.F32 R4.w, -R1.x, R0, R0; MAD.S R0.x, R0.y, {157, 0, 0, 0}, R0; MAD.S R0.w, R0.z, {113, 0, 0, 0}.x, R0.x; FRC.F32 R0.xyz, R5; MAD.F32 R1.xyz, -R0, {2, 3, 0, 0}.x, {2, 3, 0, 0}.y; MUL.F32 R0.xyz, R0, R0; MUL.F32 R6.xyz, R0, R1; MOV.S R0.w, |R0|; ADD.U R0.x, R0.w, {1, 0, 0, 0}; AND.U R0.y, R0.w, {255, 0, 0, 0}.x; AND.U R0.x, R0, {255, 0, 0, 0}; MUL.S R0.y, R0, {16, 0, 0, 0}.x; MOV.U R0.y, R0; MUL.S R0.x, R0, {16, 0, 0, 0}; LDC.F32X4 R1, buf0[R0.y + 16]; MOV.U R0.x, R0; LDC.F32X4 R0, buf0[R0.x + 16]; ADD.F32 R0, R0, -R1; MAD.F32 R0, R6.x, R0, R1; ADD.F32 R7.xy, R0.ywzw, -R0.xzzw; MAD.F32 R0.xy, R6.y, R7, R0.xzzw; MUL.F32 R5.xyz, R5, {2.00999999, 0, 0, 0}.x; FLR.F R1.xyz, R5; TRUNC.S R1.xyz, R1; MAD.S R0.z, R1.y, {157, 0, 0, 0}.x, R1.x; // Regular sea simulation float hash( vec2 p ) { float h = dot(p,vec2(127.1,311.7)); return fract(sin(h)*43758.5453123); } float noise( in vec2 p ) { vec2 i = floor( p ); vec2 f = fract( p ); vec2 u = f*f*(3.0-2.0*f); return -1.0+2.0*mix( mix( hash( i + vec2(0.0,0.0) ), hash( i + vec2(1.0,0.0) ), u.x), mix( hash( i + vec2(0.0,1.0) ), hash( i + vec2(1.0,1.0) ), u.x), u.y); } float sea_octave(vec2 uv, float choppy) { uv += noise(uv); vec2 wv = 1.0-abs(sin(uv)); vec2 swv = abs(cos(uv)); wv = mix(wv,swv,wv); return pow(1.0-pow(wv.x * wv.y,0.65),choppy); } float map(vec3 p) { float freq = SEA_FREQ; float amp = SEA_HEIGHT; float choppy = SEA_CHOPPY; vec2 uv = p.xz; uv.x *= 0.75; float d, h = 0.0; for(int i = 0; i < ITER_GEOMETRY; i++) { d = sea_octave((uv+SEA_TIME)*freq,choppy); d += sea_octave((uv-SEA_TIME)*freq,choppy); h += d * amp; uv *= octave_m; freq *= 1.9; amp *= 0.22; choppy = mix(choppy,1.0,0.2); } return p.y - h; } // Fast sea simulation... it's "sea'ish" float noise( in vec2 p ) { float f = ( sin( p.x * 1.91f ) + cos( p.y * 1.71f ) ) * 0.75f; return f; } // sea float sea_octave( vec2 uv, float choppy ) { uv += noise(uv); vec2 wv = 1.0-abs(sin(uv)); vec2 swv = abs(cos(uv)); wv = mix(wv,swv,wv); return pow(1.0-pow(wv.x * wv.y,0.65),choppy); } float map( vec3 p ) { float freq = SEA_FREQ; float amp = SEA_HEIGHT; float choppy = SEA_CHOPPY; vec2 uv = p.xz; uv.x *= 0.75; float d, h = 0.0; for(int i = 0; i < ITER_GEOMETRY; i++) { d = sea_octave((uv+SEA_TIME)*freq,choppy) * 1.4f; h += d * amp; uv *= octave_m; freq *= 1.9; amp *= 0.22; choppy = mix(choppy,1.0,0.2); } return p.y - h; } // Normal surface calculation float map(vec3 p) { p = (cos(p*.315*2.5 + sin(p.zxy*.875*2.5))); float n = length(p); p = sin(p*6. + cos(p.yzx*6.)); return n - 1. - abs(p.x*p.y*p.z)*.05; } vec3 calcNormal(in vec3 p, float d) { const vec2 e = vec2(0.01, 0); return normalize(vec3(d - map(p - e.xyy), d - map(p - e.yxy), d - map(p - e.yyx))); } // Expand the map calls and shortcut the n calculation // for lucky surface effect: vec3 calcNormal( vec3 p, float d ) { vec3 r0 = cos( p * .315 * 2.5 + sin( p.zxy * .875 * 2.5 ) ); p -= 0.01f; vec3 r1 = cos( p * .315 * 2.5 + sin( p.zxy * .875 * 2.5 ) ); vec3 v0 = vec3( r1.x, r0.y, r0.z ); vec3 v1 = vec3( r0.x, r1.y, r0.z ); vec3 v2 = vec3( r0.x, r0.y, r1.z ); float n0 = length( v0 ); float n1 = length( v1 ); float n2 = length( v2 ); v0 = sin( v0 * 6. + cos( v0.yzx * 6. ) ); v1 = sin( v0 * 6. + cos( v0.yzx * 6. ) ); float f = - 1.0f - abs( v1.x * v1.y * v1.z ) * .05; n0 = n0 - 1.0f - abs( v0.x * v0.y * v0.z ) * .05; n1 = n1 + f; n2 = n2 + f; return normalize( vec3( d - n0, d - n1, d - n2 ) ); } // Twofield world and normal calculation: float t1(vec3 p) { float v = 0.; for (int i = 0; i < N; ++i) { vec3 b = p - b1[i]; v += 5. / dot(b, b); } float d = 12. - p.y; v += 3. / (d*d); return v; } float t2(vec3 p) { float v = 0.; for (int i = 0; i < N; ++i) { vec3 b = p - b2[i]; v += 5. / dot(b, b); } float d = 12. + p.y; v += 3. / (d*d); return v; } float w1(vec3 p) { return 1. - t1(p) + t2(p); } float w2(vec3 p) { return 1. + t1(p) - t2(p); } float world(vec3 p) { return min(w1(p), w2(p)); } vec3 normal(vec3 p) { vec2 e = vec2(.001,0.); return normalize(vec3( world(p+e.xyy) - world(p-e.xyy), world(p+e.yxy) - world(p-e.yxy), world(p+e.yyx) - world(p-e.yyx))); } // Simpler and faster: vec2 t1t2( vec3 p ) { float v1 = 0.0f; float v2 = 0.0f; for( int i = 0; i < N; ++i ) { vec3 b1 = p - b1[i].xyz; vec3 b2 = p - b2[i].xyz; v1 += 5. / dot(b1, b1); v2 += 5. / dot(b2, b2); } float d1 = 12. - p.y; v1 += 3. / (d1*d1); float d2 = 12. + p.y; v2 += 3. / (d2*d2); return vec2( v1, v2 ); } float world( vec3 p ) { vec2 t12 = t1t2( p ); float w1 = 1.0f - t12.x + t12.y; float w2 = 1.0f + t12.x - t12.y; return min( w1, w2 ); } vec3 normal( vec3 p ) { vec2 e = vec2(.001,0.); return normalize( vec3( world( p + e.xyy ) - world( p - e.xyy ), world( p + e.yxy ) - world( p - e.yxy ), world( p + e.yyx ) - world( p - e.yyx ) ) ); } // Fast'ish precise atan2 calculation: // IEEE Signal Processing Magazine ( Volume: 30, Issue: 1, Jan. 2013 ) float satan2( float y, float x ) { uint sign_mask = 0x80000000u; float b = 0.596227f; // Extract the sign bits uint ux_s = (sign_mask & f2u(x) )^sign_mask; uint uy_s = (sign_mask & f2u(y) )^sign_mask; // Determine the quadrant offset float q = float( ( ~ux_s & uy_s ) >> 29 | ux_s >> 30 ); // Calculate the arctangent in the first quadrant float bxy_a = abs( b * x * y ); float num = bxy_a + y * y; float atan_1q = num / ( x * x + bxy_a + num ); // Translate it to the proper quadrant uint uatan_2q = ((ux_s ^ uy_s) | f2u(atan_1q)); return (q + u2f(uatan_2q)) * PI/2.0f - PI; } // Knut Inge Hvidsten's faster but less precise variation: float satan2( float y, float x ) { float fx = abs( x ); float fy = abs( y ); float rho = fy / (fx + fy); int xp = int( (f2u( x )>>31u)^1u ); int yp = int( f2u( y )>>31u ); int bx = xp ^ yp; float d = float( yp + bx ) + ( float( bx^1 ) - 0.5f ) * rho; return ( 1.0f - d ) * PI; } // Raytracer CPU preprocessing static void world_probe( worldinfo *w, gpu_tracer *gt ) { uint8_t r0[BLOCKSX+1]; uint8_t r1[BLOCKSX+1]; uint8_t *row0 = r0, *row1 = NULL; poscol pc[OBJNUM]; for( int i = 0; i < OBJNUM; i++ ) { pc[i].obj_pos = w->spheres[i].obj_pos; pc[i].obj_col = w->spheres[i].obj_col; } qsort( &pc[1], OBJNUM-1, sizeof(poscol), sortfunc ); for( int i = 0; i < OBJNUM; i++ ) { gt->obj_pos[i] = pc[i].obj_pos; gt->obj_col[i] = pc[i].obj_col; } v3 ray_pos = v4_get3( gt->eye ); int widthr = (w->width +BLOCKW-1)&~(BLOCKW-1); int heightr = (w->height+BLOCKH-1)&~(BLOCKH-1); for( int y = 0; y <= heightr; y += BLOCKH ) { float fy = y/(float)w->height; uint8_t *dst = row0; for( int x = 0; x <= widthr; x += BLOCKW ) { float fx = x/(float)w->width; v3 startpos = v3_normalize( v3_set( gt->startpos.x + gt->ax * fx, gt->startpos.y + gt->ay * fy, gt->startpos.z ) ); int i; for( i = 0; i < OBJNUM; i += 4 ) { v3 vv0 = v3_sub( ray_pos, v4_get3( gt->obj_pos[i+0] ) ); float rv0 = -v3_dot( vv0, vv0 ) + gt->obj_pos[i+0].w; // ... ditto for vv1-3, rv1-3 if( v3_dotsq( vv0, startpos ) + rv0 > 0.0f (...) ) break; } *dst++ = i == OBJNUM ? BLOCK_BG : BLOCK_SPH; } if( row1 == NULL ) { row0 = r1; row1 = r0; continue; } uint8_t *tmp; tmp = row0; row0 = row1; row1 = tmp; uint8_t *proberow = gt->probe + ((y-BLOCKH)/BLOCKH)*BLOCKSX; for( int x = 0; x < widthr/BLOCKW; x++ ) { *proberow++ = row0[x] & row0[x+1] & row1[x] & row1[x+1]; } } }