@Dade: I worked on the scene you used as a test case (proctexball-mix):
Code: Select all
./bin/luxcoreui -D opencl.devices.select 100 -D renderengine.type PATHOCL -D sampler.type SOBOL scenes/luxball/proctexball-mix.cfg
[LuxCore][4.977] [PathOCLBaseRenderThread::0] Kernels compilation time: 3681ms ( was 43164 ms )
This time I made sure some functions are
not inlined:
Code: Select all
[LuxCore][1.362] [PathOCLBaseRenderThread::0] Compiling kernels
[LuxCore][4.997] [PathOCLBaseRenderThread::0] Kernels not cached
[LuxCore][4.997] [PathOCLBaseRenderThread::0] Compiling Film_Clear Kernel
[LuxCore][4.997] [PathOCLBaseRenderThread::0] Compiling InitSeed Kernel
[LuxCore][4.997] [PathOCLBaseRenderThread::0] Compiling Init Kernel
[LuxCore][4.997] [PathOCLBaseRenderThread::0] Compiling AdvancePaths_MK_RT_NEXT_VERTEX Kernel
[LuxCore][4.997] [PathOCLBaseRenderThread::0] Compiling AdvancePaths_MK_HIT_NOTHING Kernel
[LuxCore][4.997] [PathOCLBaseRenderThread::0] Compiling AdvancePaths_MK_HIT_OBJECT Kernel
[LuxCore][4.997] [PathOCLBaseRenderThread::0] Compiling AdvancePaths_MK_RT_DL Kernel
[LuxCore][4.997] [PathOCLBaseRenderThread::0] Compiling AdvancePaths_MK_DL_ILLUMINATE Kernel
[LuxCore][4.997] [PathOCLBaseRenderThread::0] Compiling AdvancePaths_MK_DL_SAMPLE_BSDF Kernel
[LuxCore][4.997] [PathOCLBaseRenderThread::0] Compiling AdvancePaths_MK_GENERATE_NEXT_VERTEX_RAY Kernel
[LuxCore][4.997] [PathOCLBaseRenderThread::0] Compiling AdvancePaths_MK_SPLAT_SAMPLE Kernel
[LuxCore][4.997] [PathOCLBaseRenderThread::0] Compiling AdvancePaths_MK_NEXT_SAMPLE Kernel
[LuxCore][4.997] [PathOCLBaseRenderThread::0] Compiling AdvancePaths_MK_GENERATE_CAMERA_RAY Kernel
[LuxCore][4.997] [PathOCLBaseRenderThread::0] Kernels compilation time: 3637ms
[LuxCore][4.999] Film OpenCL image pipeline
[LuxCore][5.000] Film OpenCL Device used: GeForce GTX 1080 Intersect
Diff only for that scene compile speedup:
Code: Select all
diff --git a/include/slg/textures/texture_funcs.cl b/include/slg/textures/texture_funcs.cl
index 4f7f273..203173c 100644
--- a/include/slg/textures/texture_funcs.cl
+++ b/include/slg/textures/texture_funcs.cl
@@ -159,12 +159,12 @@ float3 FresnelApproxKTexture_ConstEvaluateSpectrum(__global HitPoint *hitPoint,
#if defined(PARAM_ENABLE_TEX_MIX)
-float MixTexture_ConstEvaluateFloat(__global HitPoint *hitPoint,
+__attribute__((noinline)) float MixTexture_ConstEvaluateFloat(__global HitPoint *hitPoint,
const float amt, const float value1, const float value2) {
return mix(value1, value2, clamp(amt, 0.f, 1.f));
}
-float3 MixTexture_ConstEvaluateSpectrum(__global HitPoint *hitPoint,
+__attribute__((noinline)) float3 MixTexture_ConstEvaluateSpectrum(__global HitPoint *hitPoint,
const float3 amt, const float3 value1, const float3 value2) {
return mix(value1, value2, clamp(amt, 0.f, 1.f));
}
@@ -177,7 +177,7 @@ float3 MixTexture_ConstEvaluateSpectrum(__global HitPoint *hitPoint,
#if defined(PARAM_ENABLE_CHECKERBOARD2D)
-float CheckerBoard2DTexture_ConstEvaluateFloat(__global HitPoint *hitPoint,
+__attribute__((noinline)) float CheckerBoard2DTexture_ConstEvaluateFloat(__global HitPoint *hitPoint,
const float value1, const float value2, __global const TextureMapping2D *mapping) {
const float2 uv = VLOAD2F(&hitPoint->uv.u);
const float2 mapUV = TextureMapping2D_Map(mapping, hitPoint);
@@ -185,7 +185,7 @@ float CheckerBoard2DTexture_ConstEvaluateFloat(__global HitPoint *hitPoint,
return ((Floor2Int(mapUV.s0) + Floor2Int(mapUV.s1)) % 2 == 0) ? value1 : value2;
}
-float3 CheckerBoard2DTexture_ConstEvaluateSpectrum(__global HitPoint *hitPoint,
+__attribute__((noinline)) float3 CheckerBoard2DTexture_ConstEvaluateSpectrum(__global HitPoint *hitPoint,
const float3 value1, const float3 value2, __global const TextureMapping2D *mapping) {
const float2 uv = VLOAD2F(&hitPoint->uv.u);
const float2 mapUV = TextureMapping2D_Map(mapping, hitPoint);
@@ -197,7 +197,7 @@ float3 CheckerBoard2DTexture_ConstEvaluateSpectrum(__global HitPoint *hitPoint,
#if defined(PARAM_ENABLE_CHECKERBOARD3D)
-float CheckerBoard3DTexture_ConstEvaluateFloat(__global HitPoint *hitPoint,
+__attribute__((noinline)) float CheckerBoard3DTexture_ConstEvaluateFloat(__global HitPoint *hitPoint,
const float value1, const float value2, __global const TextureMapping3D *mapping) {
// The +DEFAULT_EPSILON_STATIC is there as workaround for planes placed exactly on 0.0
const float3 mapP = TextureMapping3D_Map(mapping, hitPoint) + + DEFAULT_EPSILON_STATIC;
@@ -205,7 +205,7 @@ float CheckerBoard3DTexture_ConstEvaluateFloat(__global HitPoint *hitPoint,
return ((Floor2Int(mapP.x) + Floor2Int(mapP.y) + Floor2Int(mapP.z)) % 2 == 0) ? value1 : value2;
}
-float3 CheckerBoard3DTexture_ConstEvaluateSpectrum(__global HitPoint *hitPoint,
+__attribute__((noinline)) float3 CheckerBoard3DTexture_ConstEvaluateSpectrum(__global HitPoint *hitPoint,
const float3 value1, const float3 value2, __global const TextureMapping3D *mapping) {
// The +DEFAULT_EPSILON_STATIC is there as workaround for planes placed exactly on 0.0
const float3 mapP = TextureMapping3D_Map(mapping, hitPoint) + + DEFAULT_EPSILON_STATIC;
Now the Herculean task: understanding when to inline and when not to, grrrr ....
As far as I can see, one must go through all textures and check where inlining, noinline, or possibly #pragma unroll 1 (== no unroll)
should be used.
Another success:
Code: Select all
./bin/luxcoreui -D opencl.devices.select 1000 -D renderengine.type PATHOCL -D sampler.type SOBOL scenes/bump/bump-proc-mix.cfg
[LuxCore][6.839] [PathOCLBaseRenderThread::0] Kernels compilation time: 5867ms ( was > 10 minutes , i always aborted )
Got marble down from 65s to 2.3s! Got succi down to 5s as well.
I now understand the underlying problem.
Empirically verified.
By inlining selectively, my own musgrave color/bumptex is now down from 15 to 12 secs per GPU.
Jens