@@ -57,35 +57,62 @@ __kernel void rcd_write_output (__write_only image2d_t out, global float *rgb0,
5757#define eps 1e-5f // Tolerance to avoid dividing by zero
5858#define epssq 1e-10f
5959
60- // Step 1.1: Calculate a squared vertical and horizontal high pass filter on color differences
61- __kernel void rcd_step_1_1 (global float * cfa , global float * v_diff , global float * h_diff , const int w , const int height )
60+ static inline float rcd_vdiff_local (local const float * buf , const int stride )
6261{
63- const int col = 3 + get_global_id (0 );
64- const int row = 3 + get_global_id (1 );
65- if ((row > height - 4 ) || (col > w - 4 )) return ;
66- const int idx = mad24 (row , w , col );
67- const int w2 = 2 * w ;
68- const int w3 = 3 * w ;
62+ return fsquare (buf [-3 * stride ] - buf [- stride ] - buf [stride ] + buf [3 * stride ] - 3.0f * (buf [-2 * stride ] + buf [2 * stride ]) + 6.0f * buf [0 ]);
63+ }
6964
70- v_diff [idx ] = fsquare (cfa [idx - w3 ] - 3.0f * cfa [idx - w2 ] - cfa [idx - w ] + 6.0f * cfa [idx ] - cfa [idx + w ] - 3.0f * cfa [idx + w2 ] + cfa [idx + w3 ]);
71- h_diff [idx ] = fsquare (cfa [idx - 3 ] - 3.0f * cfa [idx - 2 ] - cfa [idx - 1 ] + 6.0f * cfa [idx ] - cfa [idx + 1 ] - 3.0f * cfa [idx + 2 ] + cfa [idx + 3 ]);
65+ static inline float rcd_hdiff_local (local const float * buf )
66+ {
67+ return fsquare (buf [-3 ] - buf [-1 ] - buf [1 ] + buf [3 ] - 3.0f * (buf [-2 ] + buf [2 ]) + 6.0f * buf [0 ]);
7268}
7369
74- // Step 1.2: Calculate vertical and horizontal local discrimination
75- __kernel void rcd_step_1_2 (global float * VH_dir , global float * v_diff , global float * h_diff , const int w , const int height )
70+ // Step 1.1 + 1.2: preload one CFA tile and derive the directional discrimination locally
71+ // so we avoid materializing two full-frame high-pass buffers in global memory.
72+ // helpers and rcd_step_1 from ansel code @aurelienpierre
73+ __kernel void rcd_step_1 (global float * cfa , global float * VH_dir , const int w , const int height , local float * buffer )
7674{
75+ const int xlsz = get_local_size (0 );
76+ const int ylsz = get_local_size (1 );
77+ const int xlid = get_local_id (0 );
78+ const int ylid = get_local_id (1 );
79+ const int xgid = get_group_id (0 );
80+ const int ygid = get_group_id (1 );
81+ const int l = mad24 (ylid , xlsz , xlid );
82+ const int lsz = mul24 (xlsz , ylsz );
83+ const int stride = xlsz + 8 ;
84+ const int maxbuf = mul24 (stride , ylsz + 8 );
85+ const int xul = mul24 (xgid , xlsz ) - 2 ;
86+ const int yul = mul24 (ygid , ylsz ) - 2 ;
87+
88+ for (int n = 0 ; n <= maxbuf / lsz ; n ++ )
89+ {
90+ const int bufidx = mad24 (n , lsz , l );
91+ if (bufidx >= maxbuf ) continue ;
92+ const int xx = clamp (xul + bufidx % stride , 0 , w - 1 );
93+ const int yy = clamp (yul + bufidx / stride , 0 , height - 1 );
94+ buffer [bufidx ] = cfa [mad24 (yy , w , xx )];
95+ }
96+
97+ barrier (CLK_LOCAL_MEM_FENCE );
98+
7799 const int col = 2 + get_global_id (0 );
78100 const int row = 2 + get_global_id (1 );
79101 if ((row > height - 3 ) || (col > w - 3 )) return ;
80102 const int idx = mad24 (row , w , col );
81-
82- const float V_Stat = fmax (epssq , v_diff [idx - w ] + v_diff [idx ] + v_diff [idx + w ]);
83- const float H_Stat = fmax (epssq , h_diff [idx - 1 ] + h_diff [idx ] + h_diff [idx + 1 ]);
103+ local const float * buf = buffer + mad24 (ylid + 4 , stride , xlid + 4 );
104+
105+ const float V_Stat = fmax (epssq , rcd_vdiff_local (buf - stride , stride )
106+ + rcd_vdiff_local (buf , stride )
107+ + rcd_vdiff_local (buf + stride , stride ));
108+ const float H_Stat = fmax (epssq , rcd_hdiff_local (buf - 1 )
109+ + rcd_hdiff_local (buf )
110+ + rcd_hdiff_local (buf + 1 ));
84111 VH_dir [idx ] = V_Stat / (V_Stat + H_Stat );
85112}
86113
87- // Step 2.1 : Low pass filter incorporating green, red and blue local samples from the raw data
88- __kernel void rcd_step_2_1 (global float * lpf , global float * cfa , const int w , const int height , const unsigned int filters )
114+ // Step 2: Low pass filter incorporating green, red and blue local samples from the raw data
115+ __kernel void rcd_step_2 (global float * lpf , global float * cfa , const int w , const int height , const unsigned int filters )
89116{
90117 const int row = 2 + get_global_id (1 );
91118 const int col = 2 + (FC (row , 0 , filters ) & 1 ) + 2 * get_global_id (0 );
@@ -97,8 +124,8 @@ __kernel void rcd_step_2_1(global float *lpf, global float *cfa, const int w, co
97124 + 0.25f * (cfa [idx - w - 1 ] + cfa [idx - w + 1 ] + cfa [idx + w - 1 ] + cfa [idx + w + 1 ]);
98125}
99126
100- // Step 3.1 : Populate the green channel at blue and red CFA positions
101- __kernel void rcd_step_3_1 (global float * lpf , global float * cfa , global float * rgb1 , global float * VH_Dir , const int w , const int height , const unsigned int filters )
127+ // Step 3: Populate the green channel at blue and red CFA positions
128+ __kernel void rcd_step_3 (global float * lpf , global float * cfa , global float * rgb1 , global float * VH_Dir , const int w , const int height , const unsigned int filters )
102129{
103130 const int row = 4 + get_global_id (1 );
104131 const int col = 4 + (FC (row , 0 , filters ) & 1 ) + 2 * get_global_id (0 );
@@ -133,11 +160,11 @@ __kernel void rcd_step_3_1(global float *lpf, global float *cfa, global float *r
133160 const float H_Est = (W_Grad * E_Est + E_Grad * W_Est ) / (E_Grad + W_Grad );
134161
135162 // G@B and G@R interpolation
136- rgb1 [idx ] = mix (V_Est , H_Est , VH_Disc );
163+ rgb1 [idx ] = mix (V_Est , H_Est , clipf ( VH_Disc ) );
137164}
138165
139166// Step 4.0: Calculate the square of the P/Q diagonals color difference high pass filter
140- __kernel void rcd_step_4_1 (global float * cfa , global float * p_diff , global float * q_diff , const int w , const int height , const unsigned int filters )
167+ __kernel void rcd_step_4_0 (global float * cfa , global float * p_diff , global float * q_diff , const int w , const int height , const unsigned int filters )
141168{
142169 const int row = 3 + get_global_id (1 );
143170 const int col = 3 + 2 * get_global_id (0 );
@@ -152,7 +179,7 @@ __kernel void rcd_step_4_1(global float *cfa, global float *p_diff, global float
152179}
153180
154181// Step 4.1: Calculate P/Q diagonals local discrimination strength
155- __kernel void rcd_step_4_2 (global float * PQ_dir , global float * p_diff , global float * q_diff , const int w , const int height , const unsigned int filters )
182+ __kernel void rcd_step_4_1 (global float * PQ_dir , global float * p_diff , global float * q_diff , const int w , const int height , const unsigned int filters )
156183{
157184 const int row = 2 + get_global_id (1 );
158185 const int col = 2 + (FC (row , 0 , filters ) & 1 ) + 2 * get_global_id (0 );
@@ -168,7 +195,7 @@ __kernel void rcd_step_4_2(global float *PQ_dir, global float *p_diff, global fl
168195}
169196
170197// Step 4.2: Populate the red and blue channels at blue and red CFA positions
171- __kernel void rcd_step_5_1 (global float * PQ_dir , global float * rgb0 , global float * rgb1 , global float * rgb2 , const int w , const int height , const unsigned int filters )
198+ __kernel void rcd_step_4_2 (global float * PQ_dir , global float * rgb0 , global float * rgb1 , global float * rgb2 , const int w , const int height , const unsigned int filters )
172199{
173200 const int row = 4 + get_global_id (1 );
174201 const int col = 4 + (FC (row , 0 , filters ) & 1 ) + 2 * get_global_id (0 );
@@ -204,11 +231,11 @@ __kernel void rcd_step_5_1(global float *PQ_dir, global float *rgb0, global floa
204231 const float P_Est = (NW_Grad * SE_Est + SE_Grad * NW_Est ) / (NW_Grad + SE_Grad );
205232 const float Q_Est = (NE_Grad * SW_Est + SW_Grad * NE_Est ) / (NE_Grad + SW_Grad );
206233
207- rgbc [idx ]= rgb1 [idx ] + mix (P_Est , Q_Est , PQ_Disc );
234+ rgbc [idx ]= rgb1 [idx ] + mix (P_Est , Q_Est , clipf ( PQ_Disc ) );
208235}
209236
210237// Step 4.3: Populate the red and blue channels at green CFA positions
211- __kernel void rcd_step_5_2 (global float * VH_dir , global float * rgb0 , global float * rgb1 , global float * rgb2 , const int w , const int height , const unsigned int filters )
238+ __kernel void rcd_step_4_3 (global float * VH_dir , global float * rgb0 , global float * rgb1 , global float * rgb2 , const int w , const int height , const unsigned int filters )
212239{
213240 const int row = 4 + get_global_id (1 );
214241 const int col = 4 + (FC (row , 1 , filters ) & 1 ) + 2 * get_global_id (0 );
@@ -259,7 +286,7 @@ __kernel void rcd_step_5_2(global float *VH_dir, global float *rgb0, global floa
259286 const float H_Est = (E_Grad * W_Est + W_Grad * E_Est ) / (E_Grad + W_Grad );
260287
261288 // R@G and B@G interpolation
262- rgbc [idx ] = rgb1 [idx ] + mix (V_Est , H_Est , VH_Disc );
289+ rgbc [idx ] = rgb1 [idx ] + mix (V_Est , H_Est , clipf ( VH_Disc ) );
263290 }
264291}
265292
0 commit comments