Skip to content

Commit f05600d

Browse files
authored
Merge pull request #20811 from jenshannoschwalm/various_fixes_and_maintenance
Various fixes and maintenance
2 parents ec8d1ce + f4305ed commit f05600d

27 files changed

+209
-191
lines changed

data/kernels/basic.cl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3264,7 +3264,7 @@ colorzones_v3 (read_only image2d_t in,
32643264
default:
32653265
case DT_IOP_COLORZONES_h:
32663266
select = h;
3267-
blend = dtcl_pow(1.0f - C/128.0f, 2.0f);
3267+
blend = fsquare(1.0f - C/128.0f);
32683268
break;
32693269
}
32703270

data/kernels/capture.cl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,7 @@ __kernel void prepare_blend(__read_only image2d_t cfa,
205205
if(row > 1 && col > 1 && (row < height-2) && (col < w -2))
206206
{
207207
const int w2 = 2 * w;
208-
const int color = (filters == 9u) ? FCxtrans(row, col, xtrans) : FC(row, col, filters);
208+
const int color = fcol(row, col, filters, xtrans);
209209
const float val = Areadsingle(cfa, col, row);
210210
if(val > whites[color] || Y < CAPTURE_YMIN)
211211
{

data/kernels/colorreconstruction.cl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ colorreconstruction_splat(
9393
switch(precedence)
9494
{
9595
case COLORRECONSTRUCT_PRECEDENCE_CHROMA:
96-
weight = sqrt(pixel.y * pixel.y + pixel.z * pixel.z);
96+
weight = dt_fast_hypot(pixel.y, pixel.z);
9797
break;
9898

9999
case COLORRECONSTRUCT_PRECEDENCE_HUE:

data/kernels/demosaic_rcd.cl

Lines changed: 53 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -57,35 +57,62 @@ __kernel void rcd_write_output (__write_only image2d_t out, global float *rgb0,
5757
#define eps 1e-5f // Tolerance to avoid dividing by zero
5858
#define epssq 1e-10f
5959

60-
// Step 1.1: Calculate a squared vertical and horizontal high pass filter on color differences
61-
__kernel void rcd_step_1_1 (global float *cfa, global float *v_diff, global float *h_diff, const int w, const int height)
60+
static inline float rcd_vdiff_local(local const float *buf, const int stride)
6261
{
63-
const int col = 3 + get_global_id(0);
64-
const int row = 3 + get_global_id(1);
65-
if((row > height - 4) || (col > w - 4)) return;
66-
const int idx = mad24(row, w, col);
67-
const int w2 = 2 * w;
68-
const int w3 = 3 * w;
62+
return fsquare(buf[-3 * stride] - buf[-stride] - buf[stride] + buf[3 * stride] - 3.0f *(buf[-2 * stride] + buf[2 * stride]) + 6.0f * buf[0]);
63+
}
6964

70-
v_diff[idx] = fsquare(cfa[idx - w3] - 3.0f * cfa[idx - w2] - cfa[idx - w] + 6.0f * cfa[idx] - cfa[idx + w] - 3.0f * cfa[idx + w2] + cfa[idx + w3]);
71-
h_diff[idx] = fsquare(cfa[idx - 3] - 3.0f * cfa[idx - 2] - cfa[idx - 1] + 6.0f * cfa[idx] - cfa[idx + 1] - 3.0f * cfa[idx + 2] + cfa[idx + 3]);
65+
static inline float rcd_hdiff_local(local const float *buf)
66+
{
67+
return fsquare(buf[-3] - buf[-1] - buf[1] + buf[3] - 3.0f *(buf[-2] + buf[2]) + 6.0f * buf[0]);
7268
}
7369

74-
// Step 1.2: Calculate vertical and horizontal local discrimination
75-
__kernel void rcd_step_1_2 (global float *VH_dir, global float *v_diff, global float *h_diff, const int w, const int height)
70+
// Step 1.1 + 1.2: preload one CFA tile and derive the directional discrimination locally
71+
// so we avoid materializing two full-frame high-pass buffers in global memory.
72+
// The helpers and rcd_step_1 come from the Ansel code by @aurelienpierre
73+
__kernel void rcd_step_1(global float *cfa, global float *VH_dir, const int w, const int height, local float *buffer)
7674
{
75+
const int xlsz = get_local_size(0);
76+
const int ylsz = get_local_size(1);
77+
const int xlid = get_local_id(0);
78+
const int ylid = get_local_id(1);
79+
const int xgid = get_group_id(0);
80+
const int ygid = get_group_id(1);
81+
const int l = mad24(ylid, xlsz, xlid);
82+
const int lsz = mul24(xlsz, ylsz);
83+
const int stride = xlsz + 8;
84+
const int maxbuf = mul24(stride, ylsz + 8);
85+
const int xul = mul24(xgid, xlsz) - 2;
86+
const int yul = mul24(ygid, ylsz) - 2;
87+
88+
for(int n = 0; n <= maxbuf / lsz; n++)
89+
{
90+
const int bufidx = mad24(n, lsz, l);
91+
if(bufidx >= maxbuf) continue;
92+
const int xx = clamp(xul + bufidx % stride, 0, w - 1);
93+
const int yy = clamp(yul + bufidx / stride, 0, height - 1);
94+
buffer[bufidx] = cfa[mad24(yy, w, xx)];
95+
}
96+
97+
barrier(CLK_LOCAL_MEM_FENCE);
98+
7799
const int col = 2 + get_global_id(0);
78100
const int row = 2 + get_global_id(1);
79101
if((row > height - 3) || (col > w - 3)) return;
80102
const int idx = mad24(row, w, col);
81-
82-
const float V_Stat = fmax(epssq, v_diff[idx - w] + v_diff[idx] + v_diff[idx + w]);
83-
const float H_Stat = fmax(epssq, h_diff[idx - 1] + h_diff[idx] + h_diff[idx + 1]);
103+
local const float *buf = buffer + mad24(ylid + 4, stride, xlid + 4);
104+
105+
const float V_Stat = fmax(epssq, rcd_vdiff_local(buf - stride, stride)
106+
+ rcd_vdiff_local(buf, stride)
107+
+ rcd_vdiff_local(buf + stride, stride));
108+
const float H_Stat = fmax(epssq, rcd_hdiff_local(buf - 1)
109+
+ rcd_hdiff_local(buf)
110+
+ rcd_hdiff_local(buf + 1));
84111
VH_dir[idx] = V_Stat / (V_Stat + H_Stat);
85112
}
86113

87-
// Step 2.1: Low pass filter incorporating green, red and blue local samples from the raw data
88-
__kernel void rcd_step_2_1(global float *lpf, global float *cfa, const int w, const int height, const unsigned int filters)
114+
// Step 2: Low pass filter incorporating green, red and blue local samples from the raw data
115+
__kernel void rcd_step_2(global float *lpf, global float *cfa, const int w, const int height, const unsigned int filters)
89116
{
90117
const int row = 2 + get_global_id(1);
91118
const int col = 2 + (FC(row, 0, filters) & 1) + 2 *get_global_id(0);
@@ -97,8 +124,8 @@ __kernel void rcd_step_2_1(global float *lpf, global float *cfa, const int w, co
97124
+ 0.25f * (cfa[idx - w - 1] + cfa[idx - w + 1] + cfa[idx + w - 1] + cfa[idx + w + 1]);
98125
}
99126

100-
// Step 3.1: Populate the green channel at blue and red CFA positions
101-
__kernel void rcd_step_3_1(global float *lpf, global float *cfa, global float *rgb1, global float *VH_Dir, const int w, const int height, const unsigned int filters)
127+
// Step 3: Populate the green channel at blue and red CFA positions
128+
__kernel void rcd_step_3(global float *lpf, global float *cfa, global float *rgb1, global float *VH_Dir, const int w, const int height, const unsigned int filters)
102129
{
103130
const int row = 4 + get_global_id(1);
104131
const int col = 4 + (FC(row, 0, filters) & 1) + 2 * get_global_id(0);
@@ -133,11 +160,11 @@ __kernel void rcd_step_3_1(global float *lpf, global float *cfa, global float *r
133160
const float H_Est = (W_Grad * E_Est + E_Grad * W_Est) / (E_Grad + W_Grad);
134161

135162
// G@B and G@R interpolation
136-
rgb1[idx] = mix(V_Est, H_Est, VH_Disc);
163+
rgb1[idx] = mix(V_Est, H_Est, clipf(VH_Disc));
137164
}
138165

139166
// Step 4.0: Calculate the square of the P/Q diagonals color difference high pass filter
140-
__kernel void rcd_step_4_1(global float *cfa, global float *p_diff, global float *q_diff, const int w, const int height, const unsigned int filters)
167+
__kernel void rcd_step_4_0(global float *cfa, global float *p_diff, global float *q_diff, const int w, const int height, const unsigned int filters)
141168
{
142169
const int row = 3 + get_global_id(1);
143170
const int col = 3 + 2 * get_global_id(0);
@@ -152,7 +179,7 @@ __kernel void rcd_step_4_1(global float *cfa, global float *p_diff, global float
152179
}
153180

154181
// Step 4.1: Calculate P/Q diagonals local discrimination strength
155-
__kernel void rcd_step_4_2(global float *PQ_dir, global float *p_diff, global float *q_diff, const int w, const int height, const unsigned int filters)
182+
__kernel void rcd_step_4_1(global float *PQ_dir, global float *p_diff, global float *q_diff, const int w, const int height, const unsigned int filters)
156183
{
157184
const int row = 2 + get_global_id(1);
158185
const int col = 2 + (FC(row, 0, filters) & 1) + 2 *get_global_id(0);
@@ -168,7 +195,7 @@ __kernel void rcd_step_4_2(global float *PQ_dir, global float *p_diff, global fl
168195
}
169196

170197
// Step 4.2: Populate the red and blue channels at blue and red CFA positions
171-
__kernel void rcd_step_5_1(global float *PQ_dir, global float *rgb0, global float *rgb1, global float *rgb2, const int w, const int height, const unsigned int filters)
198+
__kernel void rcd_step_4_2(global float *PQ_dir, global float *rgb0, global float *rgb1, global float *rgb2, const int w, const int height, const unsigned int filters)
172199
{
173200
const int row = 4 + get_global_id(1);
174201
const int col = 4 + (FC(row, 0, filters) & 1) + 2 * get_global_id(0);
@@ -204,11 +231,11 @@ __kernel void rcd_step_5_1(global float *PQ_dir, global float *rgb0, global floa
204231
const float P_Est = (NW_Grad * SE_Est + SE_Grad * NW_Est) / (NW_Grad + SE_Grad);
205232
const float Q_Est = (NE_Grad * SW_Est + SW_Grad * NE_Est) / (NE_Grad + SW_Grad);
206233

207-
rgbc[idx]= rgb1[idx] + mix(P_Est, Q_Est, PQ_Disc);
234+
rgbc[idx]= rgb1[idx] + mix(P_Est, Q_Est, clipf(PQ_Disc));
208235
}
209236

210237
// Step 4.3: Populate the red and blue channels at green CFA positions
211-
__kernel void rcd_step_5_2(global float *VH_dir, global float *rgb0, global float *rgb1, global float *rgb2, const int w, const int height, const unsigned int filters)
238+
__kernel void rcd_step_4_3(global float *VH_dir, global float *rgb0, global float *rgb1, global float *rgb2, const int w, const int height, const unsigned int filters)
212239
{
213240
const int row = 4 + get_global_id(1);
214241
const int col = 4 + (FC(row, 1, filters) & 1) + 2 * get_global_id(0);
@@ -259,7 +286,7 @@ __kernel void rcd_step_5_2(global float *VH_dir, global float *rgb0, global floa
259286
const float H_Est = (E_Grad * W_Est + W_Grad * E_Est) / (E_Grad + W_Grad);
260287

261288
// R@G and B@G interpolation
262-
rgbc[idx] = rgb1[idx] + mix(V_Est, H_Est, VH_Disc);
289+
rgbc[idx] = rgb1[idx] + mix(V_Est, H_Est, clipf(VH_Disc));
263290
}
264291
}
265292

data/kernels/demosaic_vng.cl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@ vng_interpolate(read_only image2d_t in,
177177
if(bufidx >= maxbuf) continue;
178178
const int xx = xul + bufidx % stride;
179179
const int yy = yul + bufidx / stride;
180-
const float4 pixel = fmax(0.0f, readpixel(in, xx, yy));
180+
const float4 pixel = readpixel(in, xx, yy);
181181
vstore4(pixel, bufidx, buffer);
182182
}
183183

data/kernels/extended.cl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -263,9 +263,9 @@ vibrance (read_only image2d_t in, write_only image2d_t out, const int width, con
263263

264264
if(x >= width || y >= height) return;
265265

266-
float4 pixel = read_imagef(in, sampleri, (int2)(x, y));
266+
float4 pixel = readpixel(in, x, y);
267267

268-
const float sw = sqrt(pixel.y*pixel.y + pixel.z*pixel.z)/256.0f;
268+
const float sw = dt_fast_hypot(pixel.y, pixel.z)/256.0f;
269269
const float ls = 1.0f - amount * sw * 0.25f;
270270
const float ss = 1.0f + amount * sw;
271271

src/control/jobs/control_jobs.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -554,7 +554,7 @@ static int _control_merge_hdr_process(dt_imageio_module_data_t *datai,
554554
const float eap = image.exif_aperture > 0.0f ? image.exif_aperture : 22.0f;
555555
const float efl = image.exif_focal_length > 0.0f ? image.exif_focal_length : 8.0f;
556556
const float rad = .5f * efl / eap;
557-
const float aperture = M_PI * rad * rad;
557+
const float aperture = M_PI_F * rad * rad;
558558
const float iso = image.exif_iso > 0.0f ? image.exif_iso : 100.0f;
559559
const float exp = image.exif_exposure > 0.0f ? image.exif_exposure : 1.0f;
560560
const float cal = 100.0f / (aperture * exp * iso);

src/develop/blend.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -252,7 +252,7 @@ static inline float _detail_mask_threshold(const float level,
252252
const gboolean detail)
253253
{
254254
// this does some range calculation for smoother ui experience
255-
return 0.005f * (detail ? powf(level, 2.0f) : 1.0f - powf(fabs(level), 0.5f ));
255+
return 0.005f * (detail ? sqrf(level) : 1.0f - sqrtf(fabs(level)));
256256
}
257257

258258
static void _refine_with_detail_mask(dt_iop_module_t *self,
@@ -1478,7 +1478,7 @@ void tiling_callback_blendop(dt_iop_module_t *self,
14781478
if(devid > DT_DEVICE_CPU)
14791479
{
14801480
/* OpenCL feathering does simple internal tiling for less mem pressure,
1481-
we still need some mem here for this.
1481+
we still need some mem here for this.
14821482
*/
14831483
tiling->factor_cl = MAX(tiling->factor, 1.0f);
14841484
}

src/develop/blends/blendif_lab.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1347,7 +1347,7 @@ static void _display_channel(const float *const restrict a,
13471347
}
13481348
case DT_DEV_PIXELPIPE_DISPLAY_LCH_C:
13491349
{
1350-
const float factor = 1.0f / (128.0f * sqrtf(2.0f) * exp2f(boost_factors[DEVELOP_BLENDIF_C_in]));
1350+
const float factor = 1.0f / (128.0f * M_SQRT2_F * exp2f(boost_factors[DEVELOP_BLENDIF_C_in]));
13511351
for(size_t i = 0, j = 0; i < stride; i++, j += DT_BLENDIF_LAB_CH)
13521352
{
13531353
dt_aligned_pixel_t LCH;

src/develop/masks/brush.c

Lines changed: 13 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ static void _brush_border_get_XY(const float p0x,
248248
*yb = DT_INVALID_COORDINATE;
249249
return;
250250
}
251-
const float l = 1.0f / sqrtf(dx * dx + dy * dy);
251+
const float l = 1.0f / dt_fast_hypotf(dx, dy);
252252
*xb = (*xc) + rad * dy * l;
253253
*yb = (*yc) - rad * dx * l;
254254
}
@@ -430,18 +430,16 @@ static void _brush_points_recurs_border_gaps(float *cmax,
430430
// we have to be sure that we turn in the correct direction
431431
if(a2 < a1 && clockwise)
432432
{
433-
a2 += 2.0f * M_PI;
433+
a2 += DT_2PI_F;
434434
}
435435
if(a2 > a1 && !clockwise)
436436
{
437-
a1 += 2.0f * M_PI;
437+
a1 += DT_2PI_F;
438438
}
439439

440440
// we determine start and end radius too
441-
float r1 = sqrtf((bmin[1] - cmax[1]) * (bmin[1] - cmax[1])
442-
+ (bmin[0] - cmax[0]) * (bmin[0] - cmax[0]));
443-
float r2 = sqrtf((bmax[1] - cmax[1]) * (bmax[1] - cmax[1])
444-
+ (bmax[0] - cmax[0]) * (bmax[0] - cmax[0]));
441+
float r1 = dt_fast_hypotf(bmin[1] - cmax[1], bmin[0] - cmax[0]);
442+
float r2 = dt_fast_hypotf(bmax[1] - cmax[1], bmax[0] - cmax[0]);
445443

446444
// and the max length of the circle arc
447445
int l;
@@ -489,21 +487,19 @@ static void _brush_points_recurs_border_small_gaps(float *cmax,
489487
{
490488
// we want to find the start and end angles
491489
const float a1 = fmodf(atan2f(bmin[1] - cmax[1], bmin[0] - cmax[0])
492-
+ 2.0f * M_PI, 2.0f * M_PI);
490+
+ DT_2PI_F, DT_2PI_F);
493491
const float a2 = fmodf(atan2f(bmax[1] - cmax[1], bmax[0] - cmax[0])
494-
+ 2.0f * M_PI, 2.0f * M_PI);
492+
+ DT_2PI_F, DT_2PI_F);
495493

496494
if(a1 == a2) return;
497495

498496
// we determine start and end radius too
499-
const float r1 = sqrtf((bmin[1] - cmax[1]) * (bmin[1] - cmax[1])
500-
+ (bmin[0] - cmax[0]) * (bmin[0] - cmax[0]));
501-
const float r2 = sqrtf((bmax[1] - cmax[1]) * (bmax[1] - cmax[1])
502-
+ (bmax[0] - cmax[0]) * (bmax[0] - cmax[0]));
497+
const float r1 = dt_fast_hypotf(bmin[1] - cmax[1], bmin[0] - cmax[0]);
498+
const float r2 = dt_fast_hypotf(bmax[1] - cmax[1], bmax[0] - cmax[0]);
503499

504500
// we close the gap in the shortest direction
505501
float delta = a2 - a1;
506-
if(fabsf(delta) > M_PI) delta = delta - copysignf(2.0f * M_PI, delta);
502+
if(fabsf(delta) > M_PI_F) delta = delta - copysignf(DT_2PI_F, delta);
507503

508504
// get the max length of the circle arc
509505
const int l = fabsf(delta) * fmaxf(r1, r2);
@@ -547,15 +543,14 @@ static void _brush_points_stamp(float *cmax,
547543
const float a1 = atan2f(bmin[1] - cmax[1], bmin[0] - cmax[0]);
548544

549545
// we determine the radius too
550-
const float rad = sqrtf((bmin[1] - cmax[1]) * (bmin[1] - cmax[1])
551-
+ (bmin[0] - cmax[0]) * (bmin[0] - cmax[0]));
546+
const float rad = dt_fast_hypotf(bmin[1] - cmax[1], bmin[0] - cmax[0]);
552547

553548
// determine the max length of the circle arc
554549
const int l = 2.0f * M_PI * rad;
555550
if(l < 2) return;
556551

557552
// and now we add the points
558-
const float incra = 2.0f * M_PI / l;
553+
const float incra = DT_2PI_F / l;
559554
float aa = a1 + incra;
560555
// allocate entries in the dynbufs
561556
float *dpoints_ptr = dt_masks_dynbuf_reserve_n(dpoints, 2*(l-1));
@@ -2273,7 +2268,7 @@ static int _brush_events_mouse_moved(struct dt_iop_module_t *module,
22732268
dt_masks_point_brush_t *point = g_list_nth_data(form->points, k);
22742269
const float nx = point->corner[0] * iwidth;
22752270
const float ny = point->corner[1] * iheight;
2276-
const float nr = sqrtf((pts[0] - nx) * (pts[0] - nx) + (pts[1] - ny) * (pts[1] - ny));
2271+
const float nr = dt_fast_hypotf(pts[0] - nx, pts[1] - ny);
22772272
const float bdr = nr / fminf(iwidth, iheight);
22782273

22792274
point->border[0] = point->border[1] = bdr;

0 commit comments

Comments
 (0)