The overnight tests of Blender showed that the ‘-finline-functions’ flag was what was causing the ‘perlin()’ function to improve by a factor of twelve. The hope was that an optimizer flag could improve that function without negative effects on the other functions. Unfortunately this was not the case: the -finline-functions flag may have improved performance of the one function, but every other function increased in run time.

Below is the gprof output for the -finline-functions build; the comparison is based on the results posted previously.

% cumulative

time seconds name

44.26 1461.83 ccl::QBVH_bvh_intersect_hair(ccl::KernelGlobals*, ccl::Ray const*, ccl::Intersection*, unsigned int, unsigned int*, float, float)

13.56 1909.60 ccl::noise_turbulence(ccl::float3, float, int) [clone .constprop.197]

7.54 2158.54 ccl::QBVH_bvh_intersect_shadow_all_hair(ccl::KernelGlobals*, ccl::Ray const*, ccl::Intersection*, unsigned int, unsigned int, unsigned int*)

7.20 2396.40 GaussianYBlurOperation::executePixel(float*, int, int, void*)

3.56 2513.84 ccl::svm_eval_nodes(ccl::KernelGlobals*, ccl::ShaderData*, ccl::PathState*, ccl::ShaderType, int)

3.05 2614.52 ccl::kernel_path_trace(ccl::KernelGlobals*, float*, int, int, int, int, int)

2.06 2682.40 ccl::shader_setup_from_ray(ccl::KernelGlobals*, ccl::ShaderData*, ccl::Intersection const*, ccl::Ray const*)

1.88 2744.62 ccl::light_sample(ccl::KernelGlobals*, float, float, float, ccl::float3, int, ccl::LightSample*)

1.85 2805.79 ccl::kernel_path_surface_bounce(ccl::KernelGlobals*, ccl::ShaderData*, ccl::float3*, ccl::PathState*, ccl::PathRadianceState*, ccl::Ray*)

1.58 2858.14 GaussianXBlurOperation::executePixel(float*, int, int, void*)

1.03 2892.22 ccl::primitive_tangent(ccl::KernelGlobals*, ccl::ShaderData*)

0.91 2922.42 svbvh_node_stack_raycast(SVBVHNode*, Isect*)

0.91 2952.52 ccl::perlin(float, float, float)

Something to notice: my worry about the optimization causing another function called ‘microfacet_beckmann()’ to be called in place of ‘perlin’ was unfounded.

Another thing to notice is that every other call has increased runtime. This may mean we want to isolate this function and simply inline it on its own.

Let’s take a look at this function.

#ifndef __KERNEL_SSE2__

ccl_device_noinline float perlin(float x, float y, float z)
{
	/* Split each coordinate into its integer lattice cell and fractional offset. */
	int X;
	int Y;
	int Z;
	const float fx = floorfrac(x, &X);
	const float fy = floorfrac(y, &Y);
	const float fz = floorfrac(z, &Z);

	/* Smoothed interpolation weights for each axis. */
	const float u = fade(fx);
	const float v = fade(fy);
	const float w = fade(fz);

	/* Gradient contribution from each of the 8 corners of the lattice cell.
	 * gXYZ names which corner: 0 = base lattice point, 1 = +1 neighbour. */
	const float g000 = grad(hash(X,     Y,     Z    ), fx,        fy,        fz);
	const float g100 = grad(hash(X + 1, Y,     Z    ), fx - 1.0f, fy,        fz);
	const float g010 = grad(hash(X,     Y + 1, Z    ), fx,        fy - 1.0f, fz);
	const float g110 = grad(hash(X + 1, Y + 1, Z    ), fx - 1.0f, fy - 1.0f, fz);
	const float g001 = grad(hash(X,     Y,     Z + 1), fx,        fy,        fz - 1.0f);
	const float g101 = grad(hash(X + 1, Y,     Z + 1), fx - 1.0f, fy,        fz - 1.0f);
	const float g011 = grad(hash(X,     Y + 1, Z + 1), fx,        fy - 1.0f, fz - 1.0f);
	const float g111 = grad(hash(X + 1, Y + 1, Z + 1), fx - 1.0f, fy - 1.0f, fz - 1.0f);

	/* Trilinear interpolation: along x first, then y, then z. */
	const float front = nerp(v, nerp(u, g000, g100), nerp(u, g010, g110));
	const float back  = nerp(v, nerp(u, g001, g101), nerp(u, g011, g111));
	const float r = scale3(nerp(w, front, back));

	/* can happen for big coordinates, things even out to 0.0 then anyway */
	return (isfinite(r)) ? r : 0.0f;
}

#else

/* SSE Perlin noise: evaluates all three axes (and the 4 lower-cube / 4
 * upper-cube lattice corners) in parallel using 4-wide vectors.
 *
 * NOTE(review): the shuffle/extract template arguments were stripped from the
 * original paste (angle brackets eaten by the blog's HTML); they are restored
 * here to match the lane selections described by the inline comments. */
ccl_device_noinline float perlin(float x, float y, float z)
{
	ssef xyz = ssef(x, y, z, 0.0f);

	/* Integer lattice coordinates and fractional offsets for all axes at once. */
	ssei XYZ;
	ssef fxyz = floorfrac_sse(xyz, &XYZ);

	/* Smoothed interpolation weights; broadcast each lane into its own vector. */
	ssef uvw = fade_sse(&fxyz);
	ssef u = shuffle<0, 0, 0, 0>(uvw), v = shuffle<1, 1, 1, 1>(uvw), w = shuffle<2, 2, 2, 2>(uvw);

	ssei XYZ_ofc = XYZ + ssei(1);
	ssei vdy = shuffle<1, 1, 1, 1>(XYZ, XYZ_ofc); // +0, +0, +1, +1
	ssei vdz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ_ofc)); // +0, +1, +0, +1

	ssei h1 = hash_sse(shuffle<0, 0, 0, 0>(XYZ), vdy, vdz); // hash directions 000, 001, 010, 011
	ssei h2 = hash_sse(shuffle<0, 0, 0, 0>(XYZ_ofc), vdy, vdz); // hash directions 100, 101, 110, 111

	ssef fxyz_ofc = fxyz - ssef(1.0f);
	ssef vfy = shuffle<1, 1, 1, 1>(fxyz, fxyz_ofc);
	ssef vfz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz_ofc));

	/* Gradient contributions for the two x-slabs of the cell. */
	ssef g1 = grad_sse(h1, shuffle<0, 0, 0, 0>(fxyz), vfy, vfz);
	ssef g2 = grad_sse(h2, shuffle<0, 0, 0, 0>(fxyz_ofc), vfy, vfz);
	ssef n1 = nerp_sse(u, g1, g2);

	ssef n1_half = shuffle<2, 3, 2, 3>(n1); // extract 2 floats to a separate vector
	ssef n2 = nerp_sse(v, n1, n1_half); // process nerp([a, b, _, _], [c, d, _, _]) -> [a', b', _, _]

	ssef n2_second = shuffle<1, 1, 1, 1>(n2); // extract b to a separate vector
	ssef result = nerp_sse(w, n2, n2_second); // process nerp([a', _, _, _], [b', _, _, _]) -> [a'', _, _, _]

	ssef r = scale3_sse(result);

	/* Flush non-finite results (possible for big coordinates) to 0.0f. */
	ssef infmask = cast(ssei(0x7f800000));
	ssef rinfmask = ((r & infmask) == infmask).m128; // 0xffffffff if r is inf/-inf/nan else 0
	ssef rfinite = andnot(rinfmask, r); // 0 if r is inf/-inf/nan else r
	return extract<0>(rfinite);
}

#endif

You can see that this function is divided into SIMD and non-SIMD versions; because this build is x86 I will assume that it compiled as the SIMD version.

For some reason this function has the no-inline declaration. I’m unsure of why this might be the case, and if I had the time I would rebuild Blender with only perlin as an inline function.

Unfortunately that would be out of scope, as I’m just testing optimizer flags in this project. Through sheer brute force it is clear that individual optimization flags aren’t the way to improve performance.

Below is a table of each optimization flag and its corresponding effect on Blender’s runtime.

Flag Runtime/Seconds
-O2 3245.36
-fvect-cost-model 3242.05
-floop-unroll-and-jam 3247.36
-ftree-partial-pre 3247.84
-ftree-loop-distribute-patterns 3251.57
-fsplit-paths 3252.05
-floop-interchange 3255.06
-ftree-slp-vectorize 3255.77
-ftree-loop-vectorize 3260.45
-fpredictive-commoning 3266.47
-fgcse-after-reload 3275.78
-ftree-loop-distribution 3288.16
-fpeel-loops 3283.03
-fipa-cp-clone 3283.68
-finline-functions 3303.00
-funswitch-loops 3306.21
-O3 3350.36