Line data Source code
1 : /******************************************************************************
2 : * $Id$
3 : *
4 : * Project: OpenCL Image Reprojector
5 : * Purpose: Implementation of the GDALWarpKernel reprojector in OpenCL.
6 : * Author: Seth Price, seth@pricepages.org
7 : *
8 : ******************************************************************************
9 : * Copyright (c) 2010, Seth Price <seth@pricepages.org>
10 : * Copyright (c) 2010-2012, Even Rouault <even dot rouault at spatialys.com>
11 : *
12 : * Permission is hereby granted, free of charge, to any person obtaining a
13 : * copy of this software and associated documentation files (the "Software"),
14 : * to deal in the Software without restriction, including without limitation
15 : * the rights to use, copy, modify, merge, publish, distribute, sublicense,
16 : * and/or sell copies of the Software, and to permit persons to whom the
17 : * Software is furnished to do so, subject to the following conditions:
18 : *
19 : * The above copyright notice and this permission notice shall be included
20 : * in all copies or substantial portions of the Software.
21 : *
22 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
23 : * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 : * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
25 : * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26 : * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
27 : * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
28 : * DEALINGS IN THE SOFTWARE.
29 : ****************************************************************************/
30 :
31 : #include <algorithm>
32 : #if defined(HAVE_OPENCL)
33 :
34 : /* The following line may be uncommented for increased debugging traces and
35 : * profiling */
36 : /* #define DEBUG_OPENCL 1 */
37 :
38 : #include <assert.h>
39 : #include <stdlib.h>
40 : #include <stdio.h>
41 : #include <limits.h>
42 : #include <float.h>
43 : #include <limits>
44 : #include <vector>
45 : #include "cpl_string.h"
46 : #include "gdalwarpkernel_opencl.h"
47 :
48 : #define handleErr(err) \
49 : do \
50 : { \
51 : if ((err) != CL_SUCCESS) \
52 : { \
53 : CPLError(CE_Failure, CPLE_AppDefined, \
54 : "Error at file %s line %d: %s", __FILE__, __LINE__, \
55 : getCLErrorString(err)); \
56 : return err; \
57 : } \
58 : } while (0)
59 :
60 : #define handleErrRetNULL(err) \
61 : do \
62 : { \
63 : if ((err) != CL_SUCCESS) \
64 : { \
65 : (*clErr) = err; \
66 : CPLError(CE_Failure, CPLE_AppDefined, \
67 : "Error at file %s line %d: %s", __FILE__, __LINE__, \
68 : getCLErrorString(err)); \
69 : return nullptr; \
70 : } \
71 : } while (0)
72 :
73 : #define handleErrGoto(err, goto_label) \
74 : do \
75 : { \
76 : if ((err) != CL_SUCCESS) \
77 : { \
78 : (*clErr) = err; \
79 : CPLError(CE_Failure, CPLE_AppDefined, \
80 : "Error at file %s line %d: %s", __FILE__, __LINE__, \
81 : getCLErrorString(err)); \
82 : goto goto_label; \
83 : } \
84 : } while (0)
85 :
86 : #define freeCLMem(clMem, fallBackMem) \
87 : do \
88 : { \
89 : if ((clMem) != nullptr) \
90 : { \
91 : handleErr(err = clReleaseMemObject(clMem)); \
92 : clMem = nullptr; \
93 : fallBackMem = nullptr; \
94 : } \
95 : else if ((fallBackMem) != nullptr) \
96 : { \
97 : CPLFree(fallBackMem); \
98 : fallBackMem = nullptr; \
99 : } \
100 : } while (false)
101 :
102 0 : static const char *getCLErrorString(cl_int err)
103 : {
104 0 : switch (err)
105 : {
106 0 : case CL_SUCCESS:
107 0 : return ("CL_SUCCESS");
108 : break;
109 0 : case CL_DEVICE_NOT_FOUND:
110 0 : return ("CL_DEVICE_NOT_FOUND");
111 : break;
112 0 : case CL_DEVICE_NOT_AVAILABLE:
113 0 : return ("CL_DEVICE_NOT_AVAILABLE");
114 : break;
115 0 : case CL_COMPILER_NOT_AVAILABLE:
116 0 : return ("CL_COMPILER_NOT_AVAILABLE");
117 : break;
118 0 : case CL_MEM_OBJECT_ALLOCATION_FAILURE:
119 0 : return ("CL_MEM_OBJECT_ALLOCATION_FAILURE");
120 : break;
121 0 : case CL_OUT_OF_RESOURCES:
122 0 : return ("CL_OUT_OF_RESOURCES");
123 : break;
124 0 : case CL_OUT_OF_HOST_MEMORY:
125 0 : return ("CL_OUT_OF_HOST_MEMORY");
126 : break;
127 0 : case CL_PROFILING_INFO_NOT_AVAILABLE:
128 0 : return ("CL_PROFILING_INFO_NOT_AVAILABLE");
129 : break;
130 0 : case CL_MEM_COPY_OVERLAP:
131 0 : return ("CL_MEM_COPY_OVERLAP");
132 : break;
133 0 : case CL_IMAGE_FORMAT_MISMATCH:
134 0 : return ("CL_IMAGE_FORMAT_MISMATCH");
135 : break;
136 0 : case CL_IMAGE_FORMAT_NOT_SUPPORTED:
137 0 : return ("CL_IMAGE_FORMAT_NOT_SUPPORTED");
138 : break;
139 0 : case CL_BUILD_PROGRAM_FAILURE:
140 0 : return ("CL_BUILD_PROGRAM_FAILURE");
141 : break;
142 0 : case CL_MAP_FAILURE:
143 0 : return ("CL_MAP_FAILURE");
144 : break;
145 0 : case CL_INVALID_VALUE:
146 0 : return ("CL_INVALID_VALUE");
147 : break;
148 0 : case CL_INVALID_DEVICE_TYPE:
149 0 : return ("CL_INVALID_DEVICE_TYPE");
150 : break;
151 0 : case CL_INVALID_PLATFORM:
152 0 : return ("CL_INVALID_PLATFORM");
153 : break;
154 0 : case CL_INVALID_DEVICE:
155 0 : return ("CL_INVALID_DEVICE");
156 : break;
157 0 : case CL_INVALID_CONTEXT:
158 0 : return ("CL_INVALID_CONTEXT");
159 : break;
160 0 : case CL_INVALID_QUEUE_PROPERTIES:
161 0 : return ("CL_INVALID_QUEUE_PROPERTIES");
162 : break;
163 0 : case CL_INVALID_COMMAND_QUEUE:
164 0 : return ("CL_INVALID_COMMAND_QUEUE");
165 : break;
166 0 : case CL_INVALID_HOST_PTR:
167 0 : return ("CL_INVALID_HOST_PTR");
168 : break;
169 0 : case CL_INVALID_MEM_OBJECT:
170 0 : return ("CL_INVALID_MEM_OBJECT");
171 : break;
172 0 : case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
173 0 : return ("CL_INVALID_IMAGE_FORMAT_DESCRIPTOR");
174 : break;
175 0 : case CL_INVALID_IMAGE_SIZE:
176 0 : return ("CL_INVALID_IMAGE_SIZE");
177 : break;
178 0 : case CL_INVALID_SAMPLER:
179 0 : return ("CL_INVALID_SAMPLER");
180 : break;
181 0 : case CL_INVALID_BINARY:
182 0 : return ("CL_INVALID_BINARY");
183 : break;
184 0 : case CL_INVALID_BUILD_OPTIONS:
185 0 : return ("CL_INVALID_BUILD_OPTIONS");
186 : break;
187 0 : case CL_INVALID_PROGRAM:
188 0 : return ("CL_INVALID_PROGRAM");
189 : break;
190 0 : case CL_INVALID_PROGRAM_EXECUTABLE:
191 0 : return ("CL_INVALID_PROGRAM_EXECUTABLE");
192 : break;
193 0 : case CL_INVALID_KERNEL_NAME:
194 0 : return ("CL_INVALID_KERNEL_NAME");
195 : break;
196 0 : case CL_INVALID_KERNEL_DEFINITION:
197 0 : return ("CL_INVALID_KERNEL_DEFINITION");
198 : break;
199 0 : case CL_INVALID_KERNEL:
200 0 : return ("CL_INVALID_KERNEL");
201 : break;
202 0 : case CL_INVALID_ARG_INDEX:
203 0 : return ("CL_INVALID_ARG_INDEX");
204 : break;
205 0 : case CL_INVALID_ARG_VALUE:
206 0 : return ("CL_INVALID_ARG_VALUE");
207 : break;
208 0 : case CL_INVALID_ARG_SIZE:
209 0 : return ("CL_INVALID_ARG_SIZE");
210 : break;
211 0 : case CL_INVALID_KERNEL_ARGS:
212 0 : return ("CL_INVALID_KERNEL_ARGS");
213 : break;
214 0 : case CL_INVALID_WORK_DIMENSION:
215 0 : return ("CL_INVALID_WORK_DIMENSION");
216 : break;
217 0 : case CL_INVALID_WORK_GROUP_SIZE:
218 0 : return ("CL_INVALID_WORK_GROUP_SIZE");
219 : break;
220 0 : case CL_INVALID_WORK_ITEM_SIZE:
221 0 : return ("CL_INVALID_WORK_ITEM_SIZE");
222 : break;
223 0 : case CL_INVALID_GLOBAL_OFFSET:
224 0 : return ("CL_INVALID_GLOBAL_OFFSET");
225 : break;
226 0 : case CL_INVALID_EVENT_WAIT_LIST:
227 0 : return ("CL_INVALID_EVENT_WAIT_LIST");
228 : break;
229 0 : case CL_INVALID_EVENT:
230 0 : return ("CL_INVALID_EVENT");
231 : break;
232 0 : case CL_INVALID_OPERATION:
233 0 : return ("CL_INVALID_OPERATION");
234 : break;
235 0 : case CL_INVALID_GL_OBJECT:
236 0 : return ("CL_INVALID_GL_OBJECT");
237 : break;
238 0 : case CL_INVALID_BUFFER_SIZE:
239 0 : return ("CL_INVALID_BUFFER_SIZE");
240 : break;
241 0 : case CL_INVALID_MIP_LEVEL:
242 0 : return ("CL_INVALID_MIP_LEVEL");
243 : break;
244 0 : case CL_INVALID_GLOBAL_WORK_SIZE:
245 0 : return ("CL_INVALID_GLOBAL_WORK_SIZE");
246 : break;
247 : }
248 :
249 0 : return "unknown_error";
250 : }
251 :
252 0 : static const char *getCLDataTypeString(cl_channel_type dataType)
253 : {
254 0 : switch (dataType)
255 : {
256 0 : case CL_SNORM_INT8:
257 0 : return "CL_SNORM_INT8";
258 0 : case CL_SNORM_INT16:
259 0 : return "CL_SNORM_INT16";
260 0 : case CL_UNORM_INT8:
261 0 : return "CL_UNORM_INT8";
262 0 : case CL_UNORM_INT16:
263 0 : return "CL_UNORM_INT16";
264 : #if 0
265 : case CL_UNORM_SHORT_565: return "CL_UNORM_SHORT_565";
266 : case CL_UNORM_SHORT_555: return "CL_UNORM_SHORT_555";
267 : case CL_UNORM_INT_101010: return "CL_UNORM_INT_101010";
268 : case CL_SIGNED_INT8: return "CL_SIGNED_INT8";
269 : case CL_SIGNED_INT16: return "CL_SIGNED_INT16";
270 : case CL_SIGNED_INT32: return "CL_SIGNED_INT32";
271 : case CL_UNSIGNED_INT8: return "CL_UNSIGNED_INT8";
272 : case CL_UNSIGNED_INT16: return "CL_UNSIGNED_INT16";
273 : case CL_UNSIGNED_INT32: return "CL_UNSIGNED_INT32";
274 : case CL_HALF_FLOAT: return "CL_HALF_FLOAT";
275 : #endif
276 0 : case CL_FLOAT:
277 0 : return "CL_FLOAT";
278 0 : default:
279 0 : return "unknown";
280 : }
281 : }
282 :
283 : /*
284 : Finds an appropriate OpenCL device. For debugging, it's
285 : always easier to use CL_DEVICE_TYPE_CPU because then */
286 : /*ok*/ /*printf() can be called
287 : from the kernel. If debugging is on, we can print the name and stats about the
288 : device we're using.
289 : */
290 0 : static cl_device_id get_device(OCLVendor *peVendor)
291 : {
292 0 : cl_int err = 0;
293 0 : size_t returned_size = 0;
294 0 : cl_char vendor_name[1024] = {0};
295 0 : cl_char device_name[1024] = {0};
296 : cl_device_type device_type;
297 :
298 0 : std::vector<cl_platform_id> platforms;
299 0 : cl_uint num_platforms = 0;
300 :
301 : static bool gbBuggyOpenCL = false;
302 0 : if (gbBuggyOpenCL)
303 0 : return nullptr;
304 : try
305 : {
306 0 : err = clGetPlatformIDs(0, nullptr, &num_platforms);
307 0 : if (err != CL_SUCCESS || num_platforms == 0)
308 0 : return nullptr;
309 :
310 0 : platforms.resize(num_platforms);
311 0 : err = clGetPlatformIDs(num_platforms, &platforms[0], nullptr);
312 0 : if (err != CL_SUCCESS)
313 0 : return nullptr;
314 : }
315 0 : catch (...)
316 : {
317 0 : gbBuggyOpenCL = true;
318 0 : CPLDebug("OpenCL", "clGetPlatformIDs() threw a C++ exception");
319 : // This should normally not happen. But that does happen with
320 : // intel-opencl 0r2.0-54426 when run under xvfb-run
321 0 : return nullptr;
322 : }
323 :
324 : const bool bUseOpenCLCPU =
325 0 : CPLTestBool(CPLGetConfigOption("OPENCL_USE_CPU", "FALSE"));
326 :
327 : struct device_info
328 : {
329 : cl_device_id id;
330 : cl_device_type device_type;
331 : std::string vendor_name;
332 : std::string device_name;
333 : };
334 :
335 0 : std::vector<device_info> openCLdevices;
336 :
337 : // In case we have several implementations, pick up the non Intel one by
338 : // default, unless the PREFERRED_OPENCL_VENDOR config option is specified.
339 0 : if (num_platforms > 0)
340 : {
341 : const char *pszBlacklistedVendor =
342 0 : CPLGetConfigOption("BLACKLISTED_OPENCL_VENDOR", nullptr);
343 :
344 0 : for (cl_uint platformIndex = 0; platformIndex < num_platforms;
345 : platformIndex++)
346 : {
347 :
348 0 : constexpr int max_devices{20};
349 : cl_uint num_devices;
350 : cl_device_id devices[max_devices];
351 0 : err = clGetDeviceIDs(platforms[platformIndex],
352 : bUseOpenCLCPU
353 : ? CL_DEVICE_TYPE_CPU
354 : : CL_DEVICE_TYPE_CPU | CL_DEVICE_TYPE_GPU,
355 : max_devices, devices, &num_devices);
356 :
357 0 : if (err != CL_SUCCESS)
358 : {
359 0 : continue;
360 : }
361 :
362 0 : cl_device_id device = nullptr;
363 :
364 0 : for (cl_uint deviceIndex = 0; deviceIndex < num_devices;
365 : ++deviceIndex)
366 : {
367 :
368 0 : device = devices[deviceIndex];
369 0 : err = clGetDeviceInfo(device, CL_DEVICE_VENDOR,
370 : sizeof(vendor_name), vendor_name,
371 : &returned_size);
372 0 : err |=
373 0 : clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name),
374 : device_name, &returned_size);
375 0 : err |=
376 0 : clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(device_type),
377 : &device_type, &returned_size);
378 0 : assert(err == CL_SUCCESS);
379 0 : if (pszBlacklistedVendor &&
380 0 : EQUAL(reinterpret_cast<const char *>(vendor_name),
381 : pszBlacklistedVendor))
382 : {
383 0 : CPLDebug("OpenCL",
384 : "Blacklisted vendor='%s' / device='%s' "
385 : "implementation skipped",
386 : vendor_name, device_name);
387 : }
388 : else
389 : {
390 0 : CPLDebug(
391 : "OpenCL",
392 : "Found vendor='%s' / device='%s' (%s implementation).",
393 : vendor_name, device_name,
394 0 : (device_type == CL_DEVICE_TYPE_GPU) ? "GPU" : "CPU");
395 0 : openCLdevices.push_back(
396 : {device, device_type,
397 : reinterpret_cast<const char *>(vendor_name),
398 : reinterpret_cast<const char *>(device_name)});
399 : }
400 : }
401 : }
402 : }
403 :
404 0 : if (!openCLdevices.empty())
405 : {
406 : // Sort, GPU first, then preferred vendor, Intel comes last (unless is the preferred of course)
407 : const std::string preferredVendorName{
408 0 : CPLGetConfigOption("PREFERRED_OPENCL_VENDOR", "")};
409 0 : std::sort(
410 : openCLdevices.begin(), openCLdevices.end(),
411 0 : [&](const device_info first, const device_info second) -> int
412 : {
413 0 : return (first.device_type == CL_DEVICE_TYPE_GPU ? 4 : 0) +
414 0 : (first.vendor_name == preferredVendorName ? 2 : 0) +
415 0 : (first.vendor_name.find("Intel") == 0 ? 0 : 1) >
416 0 : (second.device_type == CL_DEVICE_TYPE_GPU ? 4 : 0) +
417 0 : (second.vendor_name == preferredVendorName ? 2 : 0) +
418 0 : (second.vendor_name.find("Intel") == 0 ? 0 : 1);
419 : });
420 : }
421 : else
422 : {
423 0 : CPLDebug("OpenCL", "No implementation found");
424 0 : return nullptr;
425 : }
426 :
427 0 : const device_info &device{openCLdevices.front()};
428 :
429 0 : CPLDebug("OpenCL",
430 : "Connected to vendor='%s' / device='%s' (%s implementation).",
431 : device.vendor_name.c_str(), device.device_name.c_str(),
432 0 : (device.device_type == CL_DEVICE_TYPE_GPU) ? "GPU" : "CPU");
433 :
434 0 : if (STARTS_WITH(device.vendor_name.c_str(), "Advanced Micro Devices"))
435 0 : *peVendor = VENDOR_AMD;
436 0 : else if (STARTS_WITH(device.vendor_name.c_str(), "Intel"))
437 0 : *peVendor = VENDOR_INTEL;
438 : else
439 0 : *peVendor = VENDOR_OTHER;
440 :
441 0 : return device.id;
442 : }
443 :
444 : /*
445 : Given that not all OpenCL devices support the same image formats, we need to
446 : make do with what we have. This leads to wasted space, but as OpenCL matures
447 : I hope it'll get better.
448 : */
449 0 : static cl_int set_supported_formats(struct oclWarper *warper,
450 : cl_channel_order minOrderSize,
451 : cl_channel_order *chosenOrder,
452 : unsigned int *chosenSize,
453 : cl_channel_type dataType)
454 : {
455 : cl_image_format *fmtBuf =
456 0 : static_cast<cl_image_format *>(calloc(256, sizeof(cl_image_format)));
457 : cl_uint numRet;
458 : cl_uint i;
459 0 : cl_uint extraSpace = 9999;
460 : cl_int err;
461 0 : int bFound = FALSE;
462 :
463 : // Find what we *can* handle
464 0 : handleErr(err = clGetSupportedImageFormats(
465 : warper->context, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE2D, 256,
466 : fmtBuf, &numRet));
467 0 : for (i = 0; i < numRet; ++i)
468 : {
469 0 : cl_channel_order thisOrderSize = 0;
470 0 : switch (fmtBuf[i].image_channel_order)
471 : {
472 : // Only support formats which use the channels in order
473 : // (x,y,z,w)
474 0 : case CL_R:
475 : case CL_INTENSITY:
476 : case CL_LUMINANCE:
477 0 : thisOrderSize = 1;
478 0 : break;
479 0 : case CL_RG:
480 0 : thisOrderSize = 2;
481 0 : break;
482 0 : case CL_RGB:
483 0 : thisOrderSize = 3;
484 0 : break;
485 0 : case CL_RGBA:
486 0 : thisOrderSize = 4;
487 0 : break;
488 : }
489 :
490 : // Choose an order with the least wasted space
491 0 : if (fmtBuf[i].image_channel_data_type == dataType &&
492 0 : minOrderSize <= thisOrderSize &&
493 0 : extraSpace > thisOrderSize - minOrderSize)
494 : {
495 :
496 : // Set the vector size, order, & remember wasted space
497 0 : (*chosenSize) = thisOrderSize;
498 0 : (*chosenOrder) = fmtBuf[i].image_channel_order;
499 0 : extraSpace = thisOrderSize - minOrderSize;
500 0 : bFound = TRUE;
501 : }
502 : }
503 :
504 0 : free(fmtBuf);
505 :
506 0 : if (!bFound)
507 : {
508 0 : CPLDebug("OpenCL",
509 : "Cannot find supported format for dataType = %s and "
510 : "minOrderSize = %d",
511 : getCLDataTypeString(dataType), static_cast<int>(minOrderSize));
512 : }
513 0 : return (bFound) ? CL_SUCCESS : CL_INVALID_OPERATION;
514 : }
515 :
516 : /*
517 : Allocate some pinned memory that we can use as an intermediate buffer. We're
518 : using the pinned memory to assemble the data before transferring it to the
519 : device. The reason we're using pinned RAM is because the transfer speed from
520 : host RAM to device RAM is faster than non-pinned. The disadvantage is that
521 : pinned RAM is a scarce OS resource. I'm making the assumption that the user
522 : has as much pinned host RAM available as total device RAM because device RAM
523 : tends to be similarly scarce. However, if the pinned memory fails we fall back
524 : to using a regular memory allocation.
525 :
526 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
527 : */
528 0 : static cl_int alloc_pinned_mem(struct oclWarper *warper, int imgNum,
529 : size_t dataSz, void **wrkPtr, cl_mem *wrkCL)
530 : {
531 0 : cl_int err = CL_SUCCESS;
532 0 : wrkCL[imgNum] = clCreateBuffer(warper->context,
533 : CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
534 : dataSz, nullptr, &err);
535 :
536 0 : if (err == CL_SUCCESS)
537 : {
538 0 : wrkPtr[imgNum] = clEnqueueMapBuffer(warper->queue, wrkCL[imgNum],
539 : CL_FALSE, CL_MAP_WRITE, 0, dataSz,
540 : 0, nullptr, nullptr, &err);
541 0 : handleErr(err);
542 : }
543 : else
544 : {
545 0 : wrkCL[imgNum] = nullptr;
546 : #ifdef DEBUG_OPENCL
547 : CPLDebug("OpenCL", "Using fallback non-pinned memory!");
548 : #endif
549 : // Fallback to regular allocation
550 0 : wrkPtr[imgNum] = VSI_MALLOC_VERBOSE(dataSz);
551 :
552 0 : if (wrkPtr[imgNum] == nullptr)
553 0 : handleErr(err = CL_OUT_OF_HOST_MEMORY);
554 : }
555 :
556 0 : return CL_SUCCESS;
557 : }
558 :
559 : /*
560 : Allocates the working host memory for all bands of the image in the warper
561 : structure. This includes both the source image buffers and the destination
562 : buffers. This memory is located on the host, so we can assemble the image.
563 : Reasons for buffering it like this include reading each row from disk and
564 : de-interleaving bands and parts of bands. Then they can be copied to the device
565 : as a single operation fit for use as an OpenCL memory object.
566 :
567 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
568 : */
569 0 : static cl_int alloc_working_arr(struct oclWarper *warper, size_t ptrSz,
570 : size_t dataSz, CPL_UNUSED size_t *fmtSz)
571 : {
572 0 : cl_int err = CL_SUCCESS;
573 : int i, b;
574 : size_t srcDataSz1, dstDataSz1, srcDataSz4, dstDataSz4;
575 0 : const int numBands = warper->numBands;
576 :
577 : // Find the best channel order for this format
578 0 : err = set_supported_formats(warper, 1, &(warper->imgChOrder1),
579 : &(warper->imgChSize1), warper->imageFormat);
580 0 : handleErr(err);
581 0 : if (warper->useVec)
582 : {
583 0 : err = set_supported_formats(warper, 4, &(warper->imgChOrder4),
584 : &(warper->imgChSize4), warper->imageFormat);
585 0 : handleErr(err);
586 : }
587 :
588 : // Alloc space for pointers to the main image data
589 0 : warper->realWork.v =
590 0 : static_cast<void **>(VSI_CALLOC_VERBOSE(ptrSz, warper->numImages));
591 0 : warper->dstRealWork.v =
592 0 : static_cast<void **>(VSI_CALLOC_VERBOSE(ptrSz, warper->numImages));
593 0 : if (warper->realWork.v == nullptr || warper->dstRealWork.v == nullptr)
594 0 : handleErr(err = CL_OUT_OF_HOST_MEMORY);
595 :
596 0 : if (warper->imagWorkCL != nullptr)
597 : {
598 : // Alloc space for pointers to the extra channel, if it exists
599 0 : warper->imagWork.v =
600 0 : static_cast<void **>(VSI_CALLOC_VERBOSE(ptrSz, warper->numImages));
601 0 : warper->dstImagWork.v =
602 0 : static_cast<void **>(VSI_CALLOC_VERBOSE(ptrSz, warper->numImages));
603 0 : if (warper->imagWork.v == nullptr || warper->dstImagWork.v == nullptr)
604 0 : handleErr(err = CL_OUT_OF_HOST_MEMORY);
605 : }
606 : else
607 : {
608 0 : warper->imagWork.v = nullptr;
609 0 : warper->dstImagWork.v = nullptr;
610 : }
611 :
612 : // Calc the sizes we need
613 0 : srcDataSz1 =
614 0 : dataSz * warper->srcWidth * warper->srcHeight * warper->imgChSize1;
615 0 : dstDataSz1 = dataSz * warper->dstWidth * warper->dstHeight;
616 0 : srcDataSz4 =
617 0 : dataSz * warper->srcWidth * warper->srcHeight * warper->imgChSize4;
618 0 : dstDataSz4 = dataSz * warper->dstWidth * warper->dstHeight * 4;
619 :
620 : // Allocate pinned memory for each band's image
621 0 : for (b = 0, i = 0; b < numBands && i < warper->numImages; ++i)
622 : {
623 0 : if (warper->useVec && b < numBands - numBands % 4)
624 : {
625 0 : handleErr(err = alloc_pinned_mem(warper, i, srcDataSz4,
626 : warper->realWork.v,
627 : warper->realWorkCL));
628 :
629 0 : handleErr(err = alloc_pinned_mem(warper, i, dstDataSz4,
630 : warper->dstRealWork.v,
631 : warper->dstRealWorkCL));
632 0 : b += 4;
633 : }
634 : else
635 : {
636 0 : handleErr(err = alloc_pinned_mem(warper, i, srcDataSz1,
637 : warper->realWork.v,
638 : warper->realWorkCL));
639 :
640 0 : handleErr(err = alloc_pinned_mem(warper, i, dstDataSz1,
641 : warper->dstRealWork.v,
642 : warper->dstRealWorkCL));
643 0 : ++b;
644 : }
645 : }
646 :
647 0 : if (warper->imagWorkCL != nullptr)
648 : {
649 : // Allocate pinned memory for each band's extra channel, if exists
650 0 : for (b = 0, i = 0; b < numBands && i < warper->numImages; ++i)
651 : {
652 0 : if (warper->useVec && b < numBands - numBands % 4)
653 : {
654 0 : handleErr(err = alloc_pinned_mem(warper, i, srcDataSz4,
655 : warper->imagWork.v,
656 : warper->imagWorkCL));
657 :
658 0 : handleErr(err = alloc_pinned_mem(warper, i, dstDataSz4,
659 : warper->dstImagWork.v,
660 : warper->dstImagWorkCL));
661 0 : b += 4;
662 : }
663 : else
664 : {
665 0 : handleErr(err = alloc_pinned_mem(warper, i, srcDataSz1,
666 : warper->imagWork.v,
667 : warper->imagWorkCL));
668 :
669 0 : handleErr(err = alloc_pinned_mem(warper, i, dstDataSz1,
670 : warper->dstImagWork.v,
671 : warper->dstImagWorkCL));
672 0 : ++b;
673 : }
674 : }
675 : }
676 :
677 0 : return CL_SUCCESS;
678 : }
679 :
680 : /*
681 : Assemble and create the kernel. For optimization, portability, and
682 : implementation limitation reasons, the program is actually assembled from
683 : several strings, then compiled with as many invariants as possible defined by
684 : the preprocessor. There is also quite a bit of error-catching code in here
685 : because the kernel is where many bugs show up.
686 :
687 : Returns CL_SUCCESS on success and other CL_* errors in the error buffer when
688 : something goes wrong.
689 : */
690 0 : static cl_kernel get_kernel(struct oclWarper *warper, char useVec,
691 : double dfXScale, double dfYScale, double dfXFilter,
692 : double dfYFilter, int nXRadius, int nYRadius,
693 : int nFiltInitX, int nFiltInitY, cl_int *clErr)
694 : {
695 : cl_program program;
696 : cl_kernel kernel;
697 0 : cl_int err = CL_SUCCESS;
698 0 : constexpr int PROGBUF_SIZE = 128000;
699 0 : std::string buffer;
700 0 : buffer.resize(PROGBUF_SIZE);
701 0 : std::string progBuf;
702 0 : progBuf.resize(PROGBUF_SIZE);
703 0 : float dstMinVal = 0.f, dstMaxVal = 0.0;
704 :
705 0 : const char *outType = "";
706 0 : const char *dUseVec = "";
707 0 : const char *dVecf = "float";
708 0 : const char *kernGenFuncs = R""""(
709 : // ********************* General Funcs ********************
710 : void clampToDst(float fReal,
711 : __global outType *dstPtr,
712 : unsigned int iDstOffset,
713 : __constant float *fDstNoDataReal,
714 : int bandNum);
715 : void setPixel(__global outType *dstReal,
716 : __global outType *dstImag,
717 : __global float *dstDensity,
718 : __global int *nDstValid,
719 : __constant float *fDstNoDataReal,
720 : const int bandNum,
721 : vecf fDensity, vecf fReal, vecf fImag);
722 : int getPixel(__read_only image2d_t srcReal,
723 : __read_only image2d_t srcImag,
724 : __global float *fUnifiedSrcDensity,
725 : __global int *nUnifiedSrcValid,
726 : __constant char *useBandSrcValid,
727 : __global int *nBandSrcValid,
728 : const int2 iSrc,
729 : int bandNum,
730 : vecf *fDensity, vecf *fReal, vecf *fImag);
731 : int isValid(__global float *fUnifiedSrcDensity,
732 : __global int *nUnifiedSrcValid,
733 : float2 fSrcCoords );
734 : float2 getSrcCoords(__read_only image2d_t srcCoords);
735 :
736 : #ifdef USE_CLAMP_TO_DST_FLOAT
737 : void clampToDst(float fReal,
738 : __global outType *dstPtr,
739 : unsigned int iDstOffset,
740 : __constant float *fDstNoDataReal,
741 : int bandNum)
742 : {
743 : dstPtr[iDstOffset] = fReal;
744 : }
745 : #else
746 : void clampToDst(float fReal,
747 : __global outType *dstPtr,
748 : unsigned int iDstOffset,
749 : __constant float *fDstNoDataReal,
750 : int bandNum)
751 : {
752 : fReal *= dstMaxVal;
753 :
754 : if (fReal < dstMinVal)
755 : dstPtr[iDstOffset] = (outType)dstMinVal;
756 : else if (fReal > dstMaxVal)
757 : dstPtr[iDstOffset] = (outType)dstMaxVal;
758 : else
759 : dstPtr[iDstOffset] = (dstMinVal < 0) ? (outType)floor(fReal + 0.5f) : (outType)(fReal + 0.5f);
760 :
761 : if (useDstNoDataReal && bandNum >= 0 &&
762 : fDstNoDataReal[bandNum] == dstPtr[iDstOffset])
763 : {
764 : if (dstPtr[iDstOffset] == dstMinVal)
765 : dstPtr[iDstOffset] = dstMinVal + 1;
766 : else
767 : dstPtr[iDstOffset] = dstPtr[iDstOffset] - 1;
768 : }
769 : }
770 : #endif
771 :
772 : void setPixel(__global outType *dstReal,
773 : __global outType *dstImag,
774 : __global float *dstDensity,
775 : __global int *nDstValid,
776 : __constant float *fDstNoDataReal,
777 : const int bandNum,
778 : vecf fDensity, vecf fReal, vecf fImag)
779 : {
780 : unsigned int iDstOffset = get_global_id(1)*iDstWidth + get_global_id(0);
781 :
782 : #ifdef USE_VEC
783 : if (fDensity.x < 0.00001f && fDensity.y < 0.00001f &&
784 : fDensity.z < 0.00001f && fDensity.w < 0.00001f ) {
785 :
786 : fReal = 0.0f;
787 : fImag = 0.0f;
788 :
789 : } else if (fDensity.x < 0.9999f || fDensity.y < 0.9999f ||
790 : fDensity.z < 0.9999f || fDensity.w < 0.9999f ) {
791 : vecf fDstReal, fDstImag;
792 : float fDstDensity;
793 :
794 : fDstReal.x = dstReal[iDstOffset];
795 : fDstReal.y = dstReal[iDstOffset+iDstHeight*iDstWidth];
796 : fDstReal.z = dstReal[iDstOffset+iDstHeight*iDstWidth*2];
797 : fDstReal.w = dstReal[iDstOffset+iDstHeight*iDstWidth*3];
798 : if (useImag) {
799 : fDstImag.x = dstImag[iDstOffset];
800 : fDstImag.y = dstImag[iDstOffset+iDstHeight*iDstWidth];
801 : fDstImag.z = dstImag[iDstOffset+iDstHeight*iDstWidth*2];
802 : fDstImag.w = dstImag[iDstOffset+iDstHeight*iDstWidth*3];
803 : }
804 : #else
805 : if (fDensity < 0.00001f) {
806 :
807 : fReal = 0.0f;
808 : fImag = 0.0f;
809 :
810 : } else if (fDensity < 0.9999f) {
811 : vecf fDstReal, fDstImag;
812 : float fDstDensity;
813 :
814 : fDstReal = dstReal[iDstOffset];
815 : if (useImag)
816 : fDstImag = dstImag[iDstOffset];
817 : #endif
818 :
819 : if (useDstDensity)
820 : fDstDensity = dstDensity[iDstOffset];
821 : else if (useDstValid &&
822 : !((nDstValid[iDstOffset>>5] & (0x01 << (iDstOffset & 0x1f))) ))
823 : fDstDensity = 0.0f;
824 : else
825 : fDstDensity = 1.0f;
826 :
827 : vecf fDstInfluence = (1.0f - fDensity) * fDstDensity;
828 :
829 : // Density should be checked for <= 0.0 & handled by the calling function
830 : fReal = (fReal * fDensity + fDstReal * fDstInfluence) / (fDensity + fDstInfluence);
831 : if (useImag)
832 : fImag = (fImag * fDensity + fDstImag * fDstInfluence) / (fDensity + fDstInfluence);
833 : }
834 :
835 : #ifdef USE_VEC
836 : clampToDst(fReal.x, dstReal, iDstOffset, fDstNoDataReal, bandNum);
837 : clampToDst(fReal.y, dstReal, iDstOffset+iDstHeight*iDstWidth, fDstNoDataReal, bandNum);
838 : clampToDst(fReal.z, dstReal, iDstOffset+iDstHeight*iDstWidth*2, fDstNoDataReal, bandNum);
839 : clampToDst(fReal.w, dstReal, iDstOffset+iDstHeight*iDstWidth*3, fDstNoDataReal, bandNum);
840 : if (useImag) {
841 : clampToDst(fImag.x, dstImag, iDstOffset, fDstNoDataReal, -1);
842 : clampToDst(fImag.y, dstImag, iDstOffset+iDstHeight*iDstWidth, fDstNoDataReal, -1);
843 : clampToDst(fImag.z, dstImag, iDstOffset+iDstHeight*iDstWidth*2, fDstNoDataReal, -1);
844 : clampToDst(fImag.w, dstImag, iDstOffset+iDstHeight*iDstWidth*3, fDstNoDataReal, -1);
845 : }
846 : #else
847 : clampToDst(fReal, dstReal, iDstOffset, fDstNoDataReal, bandNum);
848 : if (useImag)
849 : clampToDst(fImag, dstImag, iDstOffset, fDstNoDataReal, -1);
850 : #endif
851 : }
852 :
853 : int getPixel(__read_only image2d_t srcReal,
854 : __read_only image2d_t srcImag,
855 : __global float *fUnifiedSrcDensity,
856 : __global int *nUnifiedSrcValid,
857 : __constant char *useBandSrcValid,
858 : __global int *nBandSrcValid,
859 : const int2 iSrc,
860 : int bandNum,
861 : vecf *fDensity, vecf *fReal, vecf *fImag)
862 : {
863 : int iSrcOffset = 0, iBandValidLen = 0, iSrcOffsetMask = 0;
864 : int bHasValid = FALSE;
865 :
866 : // Clamp the src offset values if needed
867 : if(useUnifiedSrcDensity | useUnifiedSrcValid | useUseBandSrcValid){
868 : int iSrcX = iSrc.x;
869 : int iSrcY = iSrc.y;
870 :
871 : // Needed because the offset isn't clamped in OpenCL hardware
872 : if(iSrcX < 0)
873 : iSrcX = 0;
874 : else if(iSrcX >= iSrcWidth)
875 : iSrcX = iSrcWidth - 1;
876 :
877 : if(iSrcY < 0)
878 : iSrcY = 0;
879 : else if(iSrcY >= iSrcHeight)
880 : iSrcY = iSrcHeight - 1;
881 :
882 : iSrcOffset = iSrcY*iSrcWidth + iSrcX;
883 : iBandValidLen = 1 + ((iSrcWidth*iSrcHeight)>>5);
884 : iSrcOffsetMask = (0x01 << (iSrcOffset & 0x1f));
885 : }
886 :
887 : if (useUnifiedSrcValid &&
888 : !((nUnifiedSrcValid[iSrcOffset>>5] & iSrcOffsetMask) ) )
889 : return FALSE;
890 :
891 : #ifdef USE_VEC
892 : if (!useUseBandSrcValid || !useBandSrcValid[bandNum] ||
893 : ((nBandSrcValid[(iSrcOffset>>5)+iBandValidLen*bandNum ] & iSrcOffsetMask)) )
894 : bHasValid = TRUE;
895 :
896 : if (!useUseBandSrcValid || !useBandSrcValid[bandNum+1] ||
897 : ((nBandSrcValid[(iSrcOffset>>5)+iBandValidLen*(1+bandNum)] & iSrcOffsetMask)) )
898 : bHasValid = TRUE;
899 :
900 : if (!useUseBandSrcValid || !useBandSrcValid[bandNum+2] ||
901 : ((nBandSrcValid[(iSrcOffset>>5)+iBandValidLen*(2+bandNum)] & iSrcOffsetMask)) )
902 : bHasValid = TRUE;
903 :
904 : if (!useUseBandSrcValid || !useBandSrcValid[bandNum+3] ||
905 : ((nBandSrcValid[(iSrcOffset>>5)+iBandValidLen*(3+bandNum)] & iSrcOffsetMask)) )
906 : bHasValid = TRUE;
907 : #else
908 : if (!useUseBandSrcValid || !useBandSrcValid[bandNum] ||
909 : ((nBandSrcValid[(iSrcOffset>>5)+iBandValidLen*bandNum ] & iSrcOffsetMask)) )
910 : bHasValid = TRUE;
911 : #endif
912 :
913 : if (!bHasValid)
914 : return FALSE;
915 :
916 : const sampler_t samp = CLK_NORMALIZED_COORDS_FALSE |
917 : CLK_ADDRESS_CLAMP_TO_EDGE |
918 : CLK_FILTER_NEAREST;
919 :
920 : #ifdef USE_VEC
921 : (*fReal) = read_imagef(srcReal, samp, iSrc);
922 : if (useImag)
923 : (*fImag) = read_imagef(srcImag, samp, iSrc);
924 : #else
925 : (*fReal) = read_imagef(srcReal, samp, iSrc).x;
926 : if (useImag)
927 : (*fImag) = read_imagef(srcImag, samp, iSrc).x;
928 : #endif
929 :
930 : if (useUnifiedSrcDensity) {
931 : (*fDensity) = fUnifiedSrcDensity[iSrcOffset];
932 : } else {
933 : (*fDensity) = 1.0f;
934 : return TRUE;
935 : }
936 :
937 : #ifdef USE_VEC
938 : return (*fDensity).x > 0.0000001f || (*fDensity).y > 0.0000001f ||
939 : (*fDensity).z > 0.0000001f || (*fDensity).w > 0.0000001f;
940 : #else
941 : return (*fDensity) > 0.0000001f;
942 : #endif
943 : }
944 :
945 : int isValid(__global float *fUnifiedSrcDensity,
946 : __global int *nUnifiedSrcValid,
947 : float2 fSrcCoords )
948 : {
949 : if (fSrcCoords.x < 0.0f || fSrcCoords.y < 0.0f)
950 : return FALSE;
951 :
952 : int iSrcX = (int) (fSrcCoords.x - 0.5f);
953 : int iSrcY = (int) (fSrcCoords.y - 0.5f);
954 :
955 : if( iSrcX < 0 || iSrcX >= iSrcWidth || iSrcY < 0 || iSrcY >= iSrcHeight )
956 : return FALSE;
957 :
958 : int iSrcOffset = iSrcX + iSrcY * iSrcWidth;
959 :
960 : if (useUnifiedSrcDensity && fUnifiedSrcDensity[iSrcOffset] < 0.00001f)
961 : return FALSE;
962 :
963 : if (useUnifiedSrcValid &&
964 : !(nUnifiedSrcValid[iSrcOffset>>5] & (0x01 << (iSrcOffset & 0x1f))) )
965 : return FALSE;
966 :
967 : return TRUE;
968 : }
969 :
970 : float2 getSrcCoords(__read_only image2d_t srcCoords)
971 : {
972 : // Find an appropriate place to sample the coordinates so we're still
973 : // accurate after linear interpolation.
974 : int nDstX = get_global_id(0);
975 : int nDstY = get_global_id(1);
976 : float2 fDst = (float2)((0.5f * (float)iCoordMult + nDstX) /
977 : (float)((ceil((iDstWidth - 1) / (float)iCoordMult) + 1) * iCoordMult),
978 : (0.5f * (float)iCoordMult + nDstY) /
979 : (float)((ceil((iDstHeight - 1) / (float)iCoordMult) + 1) * iCoordMult));
980 :
981 : // Check & return when the thread group overruns the image size
982 : if (nDstX >= iDstWidth || nDstY >= iDstHeight)
983 : return (float2)(-99.0f, -99.0f);
984 :
985 : const sampler_t samp = CLK_NORMALIZED_COORDS_TRUE |
986 : CLK_ADDRESS_CLAMP_TO_EDGE |
987 : CLK_FILTER_LINEAR;
988 :
989 : float4 fSrcCoords = read_imagef(srcCoords,samp,fDst);
990 :
991 : return (float2)(fSrcCoords.x, fSrcCoords.y);
992 : }
993 : )"""";
994 :
995 0 : const char *kernBilinear = R""""(
996 : // ************************ Bilinear ************************
997 : __kernel void resamp(__read_only image2d_t srcCoords,
998 : __read_only image2d_t srcReal,
999 : __read_only image2d_t srcImag,
1000 : __global float *fUnifiedSrcDensity,
1001 : __global int *nUnifiedSrcValid,
1002 : __constant char *useBandSrcValid,
1003 : __global int *nBandSrcValid,
1004 : __global outType *dstReal,
1005 : __global outType *dstImag,
1006 : __constant float *fDstNoDataReal,
1007 : __global float *dstDensity,
1008 : __global int *nDstValid,
1009 : const int bandNum)
1010 : {
1011 : float2 fSrc = getSrcCoords(srcCoords);
1012 : if (!isValid(fUnifiedSrcDensity, nUnifiedSrcValid, fSrc))
1013 : return;
1014 :
1015 : int iSrcX = (int) floor(fSrc.x - 0.5f);
1016 : int iSrcY = (int) floor(fSrc.y - 0.5f);
1017 : float fRatioX = 1.5f - (fSrc.x - iSrcX);
1018 : float fRatioY = 1.5f - (fSrc.y - iSrcY);
1019 : vecf fReal, fImag, fDens;
1020 : vecf fAccumulatorReal = 0.0f, fAccumulatorImag = 0.0f;
1021 : vecf fAccumulatorDensity = 0.0f;
1022 : float fAccumulatorDivisor = 0.0f;
1023 :
1024 : if ( iSrcY >= 0 && iSrcY < iSrcHeight ) {
1025 : float fMult1 = fRatioX * fRatioY;
1026 : float fMult2 = (1.0f-fRatioX) * fRatioY;
1027 :
1028 : // Upper Left Pixel
1029 : if ( iSrcX >= 0 && iSrcX < iSrcWidth
1030 : && getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1031 : useBandSrcValid, nBandSrcValid, (int2)(iSrcX, iSrcY),
1032 : bandNum, &fDens, &fReal, &fImag))
1033 : {
1034 : fAccumulatorDivisor += fMult1;
1035 : fAccumulatorReal += fReal * fMult1;
1036 : fAccumulatorImag += fImag * fMult1;
1037 : fAccumulatorDensity += fDens * fMult1;
1038 : }
1039 :
1040 : // Upper Right Pixel
1041 : if ( iSrcX+1 >= 0 && iSrcX+1 < iSrcWidth
1042 : && getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1043 : useBandSrcValid, nBandSrcValid, (int2)(iSrcX+1, iSrcY),
1044 : bandNum, &fDens, &fReal, &fImag))
1045 : {
1046 : fAccumulatorDivisor += fMult2;
1047 : fAccumulatorReal += fReal * fMult2;
1048 : fAccumulatorImag += fImag * fMult2;
1049 : fAccumulatorDensity += fDens * fMult2;
1050 : }
1051 : }
1052 :
1053 : if ( iSrcY+1 >= 0 && iSrcY+1 < iSrcHeight ) {
1054 : float fMult1 = fRatioX * (1.0f-fRatioY);
1055 : float fMult2 = (1.0f-fRatioX) * (1.0f-fRatioY);
1056 :
1057 : // Lower Left Pixel
1058 : if ( iSrcX >= 0 && iSrcX < iSrcWidth
1059 : && getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1060 : useBandSrcValid, nBandSrcValid, (int2)(iSrcX, iSrcY+1),
1061 : bandNum, &fDens, &fReal, &fImag))
1062 : {
1063 : fAccumulatorDivisor += fMult1;
1064 : fAccumulatorReal += fReal * fMult1;
1065 : fAccumulatorImag += fImag * fMult1;
1066 : fAccumulatorDensity += fDens * fMult1;
1067 : }
1068 :
1069 : // Lower Right Pixel
1070 : if ( iSrcX+1 >= 0 && iSrcX+1 < iSrcWidth
1071 : && getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1072 : useBandSrcValid, nBandSrcValid, (int2)(iSrcX+1, iSrcY+1),
1073 : bandNum, &fDens, &fReal, &fImag))
1074 : {
1075 : fAccumulatorDivisor += fMult2;
1076 : fAccumulatorReal += fReal * fMult2;
1077 : fAccumulatorImag += fImag * fMult2;
1078 : fAccumulatorDensity += fDens * fMult2;
1079 : }
1080 : }
1081 :
1082 : // Compute and save final pixel
1083 : if ( fAccumulatorDivisor < 0.00001f ) {
1084 : setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
1085 : 0.0f, 0.0f, 0.0f );
1086 : } else if ( fAccumulatorDivisor < 0.99999f || fAccumulatorDivisor > 1.00001f ) {
1087 : setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
1088 : fAccumulatorDensity / fAccumulatorDivisor,
1089 : fAccumulatorReal / fAccumulatorDivisor,
1090 : #if useImag != 0
1091 : fAccumulatorImag / fAccumulatorDivisor );
1092 : #else
1093 : 0.0f );
1094 : #endif
1095 : } else {
1096 : setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
1097 : fAccumulatorDensity, fAccumulatorReal, fAccumulatorImag );
1098 : }
1099 : }
1100 : )"""";
1101 :
1102 0 : const char *kernCubic = R""""(
1103 : // ************************ Cubic ************************
1104 : vecf cubicConvolution(float dist1, float dist2, float dist3,
1105 : vecf f0, vecf f1, vecf f2, vecf f3);
1106 :
1107 : vecf cubicConvolution(float dist1, float dist2, float dist3,
1108 : vecf f0, vecf f1, vecf f2, vecf f3)
1109 : {
1110 : return ( f1
1111 : + 0.5f * (dist1*(f2 - f0)
1112 : + dist2*(2.0f*f0 - 5.0f*f1 + 4.0f*f2 - f3)
1113 : + dist3*(3.0f*(f1 - f2) + f3 - f0)));
1114 : }
1115 :
1116 : // ************************ Cubic ************************
1117 : __kernel void resamp(__read_only image2d_t srcCoords,
1118 : __read_only image2d_t srcReal,
1119 : __read_only image2d_t srcImag,
1120 : __global float *fUnifiedSrcDensity,
1121 : __global int *nUnifiedSrcValid,
1122 : __constant char *useBandSrcValid,
1123 : __global int *nBandSrcValid,
1124 : __global outType *dstReal,
1125 : __global outType *dstImag,
1126 : __constant float *fDstNoDataReal,
1127 : __global float *dstDensity,
1128 : __global int *nDstValid,
1129 : const int bandNum)
1130 : {
1131 : int i;
1132 : float2 fSrc = getSrcCoords(srcCoords);
1133 :
1134 : if (!isValid(fUnifiedSrcDensity, nUnifiedSrcValid, fSrc))
1135 : return;
1136 :
1137 : int iSrcX = (int) floor( fSrc.x - 0.5f );
1138 : int iSrcY = (int) floor( fSrc.y - 0.5f );
1139 : float fDeltaX = fSrc.x - 0.5f - (float)iSrcX;
1140 : float fDeltaY = fSrc.y - 0.5f - (float)iSrcY;
1141 : float fDeltaX2 = fDeltaX * fDeltaX;
1142 : float fDeltaY2 = fDeltaY * fDeltaY;
1143 : float fDeltaX3 = fDeltaX2 * fDeltaX;
1144 : float fDeltaY3 = fDeltaY2 * fDeltaY;
1145 : vecf afReal[4], afDens[4];
1146 : #if useImag != 0
1147 : vecf afImag[4];
1148 : #else
1149 : vecf fImag = 0.0f;
1150 : #endif
1151 :
1152 : // Loop over rows
1153 : for (i = -1; i < 3; ++i)
1154 : {
1155 : vecf fReal1 = 0.0f, fReal2 = 0.0f, fReal3 = 0.0f, fReal4 = 0.0f;
1156 : vecf fDens1 = 0.0f, fDens2 = 0.0f, fDens3 = 0.0f, fDens4 = 0.0f;
1157 : int hasPx;
1158 : #if useImag != 0
1159 : vecf fImag1 = 0.0f, fImag2 = 0.0f, fImag3 = 0.0f, fImag4 = 0.0f;
1160 :
1161 : //Get all the pixels for this row
1162 : hasPx = getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1163 : useBandSrcValid, nBandSrcValid, (int2)(iSrcX-1, iSrcY+i),
1164 : bandNum, &fDens1, &fReal1, &fImag1);
1165 :
1166 : hasPx |= getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1167 : useBandSrcValid, nBandSrcValid, (int2)(iSrcX , iSrcY+i),
1168 : bandNum, &fDens2, &fReal2, &fImag2);
1169 :
1170 : hasPx |= getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1171 : useBandSrcValid, nBandSrcValid, (int2)(iSrcX+1, iSrcY+i),
1172 : bandNum, &fDens3, &fReal3, &fImag3);
1173 :
1174 : hasPx |= getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1175 : useBandSrcValid, nBandSrcValid, (int2)(iSrcX+2, iSrcY+i),
1176 : bandNum, &fDens4, &fReal4, &fImag4);
1177 : #else
1178 : //Get all the pixels for this row
1179 : hasPx = getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1180 : useBandSrcValid, nBandSrcValid, (int2)(iSrcX-1, iSrcY+i),
1181 : bandNum, &fDens1, &fReal1, &fImag);
1182 :
1183 : hasPx |= getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1184 : useBandSrcValid, nBandSrcValid, (int2)(iSrcX , iSrcY+i),
1185 : bandNum, &fDens2, &fReal2, &fImag);
1186 :
1187 : hasPx |= getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1188 : useBandSrcValid, nBandSrcValid, (int2)(iSrcX+1, iSrcY+i),
1189 : bandNum, &fDens3, &fReal3, &fImag);
1190 :
1191 : hasPx |= getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1192 : useBandSrcValid, nBandSrcValid, (int2)(iSrcX+2, iSrcY+i),
1193 : bandNum, &fDens4, &fReal4, &fImag);
1194 : #endif
1195 :
1196 : // Shortcut if no px
1197 : if (!hasPx) {
1198 : afDens[i+1] = 0.0f;
1199 : afReal[i+1] = 0.0f;
1200 : #if useImag != 0
1201 : afImag[i+1] = 0.0f;
1202 : #endif
1203 : continue;
1204 : }
1205 :
1206 : // Process this row
1207 : afDens[i+1] = cubicConvolution(fDeltaX, fDeltaX2, fDeltaX3, fDens1, fDens2, fDens3, fDens4);
1208 : afReal[i+1] = cubicConvolution(fDeltaX, fDeltaX2, fDeltaX3, fReal1, fReal2, fReal3, fReal4);
1209 : #if useImag != 0
1210 : afImag[i+1] = cubicConvolution(fDeltaX, fDeltaX2, fDeltaX3, fImag1, fImag2, fImag3, fImag4);
1211 : #endif
1212 : }
1213 :
1214 : // Compute and save final pixel
1215 : setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
1216 : cubicConvolution(fDeltaY, fDeltaY2, fDeltaY3, afDens[0], afDens[1], afDens[2], afDens[3]),
1217 : cubicConvolution(fDeltaY, fDeltaY2, fDeltaY3, afReal[0], afReal[1], afReal[2], afReal[3]),
1218 : #if useImag != 0
1219 : cubicConvolution(fDeltaY, fDeltaY2, fDeltaY3, afImag[0], afImag[1], afImag[2], afImag[3]) );
1220 : #else
1221 : fImag );
1222 : #endif
1223 : }
1224 : )"""";
1225 :
1226 0 : const char *kernResampler = R""""(
1227 : // ************************ LanczosSinc ************************
1228 :
1229 : float lanczosSinc( float fX, float fR );
1230 : float bSpline( float x );
1231 :
1232 : float lanczosSinc( float fX, float fR )
1233 : {
1234 : if ( fX > fR || fX < -fR)
1235 : return 0.0f;
1236 : if ( fX == 0.0f )
1237 : return 1.0f;
1238 :
1239 : float fPIX = PI * fX;
1240 : return ( sin(fPIX) / fPIX ) * ( sin(fPIX / fR) * fR / fPIX );
1241 : }
1242 :
1243 : // ************************ Bicubic Spline ************************
1244 :
1245 : float bSpline( float x )
1246 : {
1247 : float xp2 = x + 2.0f;
1248 : float xp1 = x + 1.0f;
1249 : float xm1 = x - 1.0f;
1250 : float xp2c = xp2 * xp2 * xp2;
1251 :
1252 : return (((xp2 > 0.0f)?((xp1 > 0.0f)?((x > 0.0f)?((xm1 > 0.0f)?
1253 : -4.0f * xm1*xm1*xm1:0.0f) +
1254 : 6.0f * x*x*x:0.0f) +
1255 : -4.0f * xp1*xp1*xp1:0.0f) +
1256 : xp2c:0.0f) ) * 0.166666666666666666666f;
1257 : }
1258 :
1259 : // ************************ General Resampler ************************
1260 :
1261 : __kernel void resamp(__read_only image2d_t srcCoords,
1262 : __read_only image2d_t srcReal,
1263 : __read_only image2d_t srcImag,
1264 : __global float *fUnifiedSrcDensity,
1265 : __global int *nUnifiedSrcValid,
1266 : __constant char *useBandSrcValid,
1267 : __global int *nBandSrcValid,
1268 : __global outType *dstReal,
1269 : __global outType *dstImag,
1270 : __constant float *fDstNoDataReal,
1271 : __global float *dstDensity,
1272 : __global int *nDstValid,
1273 : const int bandNum)
1274 : {
1275 : float2 fSrc = getSrcCoords(srcCoords);
1276 :
1277 : if (!isValid(fUnifiedSrcDensity, nUnifiedSrcValid, fSrc))
1278 : return;
1279 :
1280 : int iSrcX = floor( fSrc.x - 0.5f );
1281 : int iSrcY = floor( fSrc.y - 0.5f );
1282 : float fDeltaX = fSrc.x - 0.5f - (float)iSrcX;
1283 : float fDeltaY = fSrc.y - 0.5f - (float)iSrcY;
1284 :
1285 : vecf fAccumulatorReal = 0.0f, fAccumulatorImag = 0.0f;
1286 : vecf fAccumulatorDensity = 0.0f;
1287 : float fAccumulatorWeight = 0.0f;
1288 : int i, j;
1289 :
1290 : // Loop over pixel rows in the kernel
1291 : for ( j = nFiltInitY; j <= nYRadius; ++j )
1292 : {
1293 : float fWeight1;
1294 : int2 iSrc = (int2)(0, iSrcY + j);
1295 :
1296 : // Skip sampling over edge of image
1297 : if ( iSrc.y < 0 || iSrc.y >= iSrcHeight )
1298 : continue;
1299 :
1300 : // Select the resampling algorithm
1301 : if ( doCubicSpline )
1302 : // Calculate the Y weight
1303 : fWeight1 = ( fYScale < 1.0f ) ?
1304 : bSpline(((float)j) * fYScale) * fYScale :
1305 : bSpline(((float)j) - fDeltaY);
1306 : else
1307 : fWeight1 = ( fYScale < 1.0f ) ?
1308 : lanczosSinc(j * fYScale, fYFilter) * fYScale :
1309 : lanczosSinc(j - fDeltaY, fYFilter);
1310 :
1311 : // Iterate over pixels in row
1312 : for ( i = nFiltInitX; i <= nXRadius; ++i )
1313 : {
1314 : float fWeight2;
1315 : vecf fDensity = 0.0f, fReal, fImag;
1316 : iSrc.x = iSrcX + i;
1317 :
1318 : // Skip sampling at edge of image
1319 : // Skip sampling when invalid pixel
1320 : if ( iSrc.x < 0 || iSrc.x >= iSrcWidth ||
1321 : !getPixel(srcReal, srcImag, fUnifiedSrcDensity,
1322 : nUnifiedSrcValid, useBandSrcValid, nBandSrcValid,
1323 : iSrc, bandNum, &fDensity, &fReal, &fImag) )
1324 : continue;
1325 :
1326 : // Choose among possible algorithms
1327 : if ( doCubicSpline )
1328 : // Calculate & save the X weight
1329 : fWeight2 = fWeight1 * ((fXScale < 1.0f ) ?
1330 : bSpline((float)i * fXScale) * fXScale :
1331 : bSpline(fDeltaX - (float)i));
1332 : else
1333 : // Calculate & save the X weight
1334 : fWeight2 = fWeight1 * ((fXScale < 1.0f ) ?
1335 : lanczosSinc(i * fXScale, fXFilter) * fXScale :
1336 : lanczosSinc(i - fDeltaX, fXFilter));
1337 :
1338 : // Accumulate!
1339 : fAccumulatorReal += fReal * fWeight2;
1340 : fAccumulatorImag += fImag * fWeight2;
1341 : fAccumulatorDensity += fDensity * fWeight2;
1342 : fAccumulatorWeight += fWeight2;
1343 : }
1344 : }
1345 :
1346 : if ( fAccumulatorWeight < 0.000001f ) {
1347 : setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
1348 : 0.0f, 0.0f, 0.0f);
1349 : } else if ( fAccumulatorWeight < 0.99999f || fAccumulatorWeight > 1.00001f ) {
1350 : // Calculate the output taking into account weighting
1351 : setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
1352 : fAccumulatorDensity / fAccumulatorWeight,
1353 : fAccumulatorReal / fAccumulatorWeight,
1354 : #if useImag != 0
1355 : fAccumulatorImag / fAccumulatorWeight );
1356 : #else
1357 : 0.0f );
1358 : #endif
1359 : } else {
1360 : setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
1361 : fAccumulatorDensity, fAccumulatorReal, fAccumulatorImag);
1362 : }
1363 : }
1364 : )"""";
1365 :
1366 : // Defines based on image format
1367 0 : switch (warper->imageFormat)
1368 : {
1369 0 : case CL_FLOAT:
1370 0 : dstMinVal = std::numeric_limits<float>::lowest();
1371 0 : dstMaxVal = std::numeric_limits<float>::max();
1372 0 : outType = "float";
1373 0 : break;
1374 0 : case CL_SNORM_INT8:
1375 0 : dstMinVal = -128.0;
1376 0 : dstMaxVal = 127.0;
1377 0 : outType = "char";
1378 0 : break;
1379 0 : case CL_UNORM_INT8:
1380 0 : dstMinVal = 0.0;
1381 0 : dstMaxVal = 255.0;
1382 0 : outType = "uchar";
1383 0 : break;
1384 0 : case CL_SNORM_INT16:
1385 0 : dstMinVal = -32768.0;
1386 0 : dstMaxVal = 32767.0;
1387 0 : outType = "short";
1388 0 : break;
1389 0 : case CL_UNORM_INT16:
1390 0 : dstMinVal = 0.0;
1391 0 : dstMaxVal = 65535.0;
1392 0 : outType = "ushort";
1393 0 : break;
1394 0 : default:
1395 0 : CPLError(CE_Failure, CPLE_AppDefined, "Unhandled imageFormat = %d",
1396 : warper->imageFormat);
1397 0 : return nullptr;
1398 : }
1399 :
1400 : // Use vector format?
1401 0 : if (useVec)
1402 : {
1403 0 : dUseVec = "-D USE_VEC";
1404 0 : dVecf = "float4";
1405 : }
1406 :
1407 : // Assemble the kernel from parts. The compiler is unable to handle multiple
1408 : // kernels in one string with more than a few __constant modifiers each.
1409 0 : if (warper->resampAlg == OCL_Bilinear)
1410 0 : snprintf(&progBuf[0], PROGBUF_SIZE, "%s\n%s", kernGenFuncs,
1411 : kernBilinear);
1412 0 : else if (warper->resampAlg == OCL_Cubic)
1413 0 : snprintf(&progBuf[0], PROGBUF_SIZE, "%s\n%s", kernGenFuncs, kernCubic);
1414 : else
1415 0 : snprintf(&progBuf[0], PROGBUF_SIZE, "%s\n%s", kernGenFuncs,
1416 : kernResampler);
1417 :
1418 : // Actually make the program from assembled source
1419 0 : const char *pszProgBuf = progBuf.c_str();
1420 0 : program = clCreateProgramWithSource(warper->context, 1, &pszProgBuf,
1421 : nullptr, &err);
1422 0 : handleErrGoto(err, error_final);
1423 :
1424 : // Assemble the compiler arg string for speed. All invariants should be
1425 : // defined here.
1426 0 : snprintf(
1427 0 : &buffer[0], PROGBUF_SIZE,
1428 : "-cl-fast-relaxed-math -Werror -D FALSE=0 -D TRUE=1 "
1429 : "%s"
1430 : "-D iSrcWidth=%d -D iSrcHeight=%d -D iDstWidth=%d -D iDstHeight=%d "
1431 : "-D useUnifiedSrcDensity=%d -D useUnifiedSrcValid=%d "
1432 : "-D useDstDensity=%d -D useDstValid=%d -D useImag=%d "
1433 : "-D fXScale=%015.15lff -D fYScale=%015.15lff -D fXFilter=%015.15lff -D "
1434 : "fYFilter=%015.15lff "
1435 : "-D nXRadius=%d -D nYRadius=%d -D nFiltInitX=%d -D nFiltInitY=%d "
1436 : "-D PI=%015.15lff -D outType=%s -D dstMinVal=%015.15lff -D "
1437 : "dstMaxVal=%015.15lff "
1438 : "-D useDstNoDataReal=%d -D vecf=%s %s -D doCubicSpline=%d "
1439 : "-D useUseBandSrcValid=%d -D iCoordMult=%d ",
1440 0 : (warper->imageFormat == CL_FLOAT) ? "-D USE_CLAMP_TO_DST_FLOAT=1 " : "",
1441 : warper->srcWidth, warper->srcHeight, warper->dstWidth,
1442 : warper->dstHeight, warper->useUnifiedSrcDensity,
1443 : warper->useUnifiedSrcValid, warper->useDstDensity, warper->useDstValid,
1444 0 : warper->imagWorkCL != nullptr, dfXScale, dfYScale, dfXFilter, dfYFilter,
1445 : nXRadius, nYRadius, nFiltInitX, nFiltInitY, M_PI, outType, dstMinVal,
1446 0 : dstMaxVal, warper->fDstNoDataRealCL != nullptr, dVecf, dUseVec,
1447 0 : warper->resampAlg == OCL_CubicSpline,
1448 0 : warper->nBandSrcValidCL != nullptr, warper->coordMult);
1449 :
1450 0 : (*clErr) = err = clBuildProgram(program, 1, &(warper->dev), buffer.data(),
1451 : nullptr, nullptr);
1452 :
1453 : // Detailed debugging info
1454 0 : if (err != CL_SUCCESS)
1455 : {
1456 0 : const char *pszStatus = "unknown_status";
1457 0 : err = clGetProgramBuildInfo(program, warper->dev, CL_PROGRAM_BUILD_LOG,
1458 0 : PROGBUF_SIZE, &buffer[0], nullptr);
1459 0 : handleErrGoto(err, error_free_program);
1460 :
1461 0 : CPLError(CE_Failure, CPLE_AppDefined,
1462 : "Error: Failed to build program executable!\nBuild Log:\n%s",
1463 : buffer.c_str());
1464 :
1465 0 : err =
1466 0 : clGetProgramBuildInfo(program, warper->dev, CL_PROGRAM_BUILD_STATUS,
1467 0 : PROGBUF_SIZE, &buffer[0], nullptr);
1468 0 : handleErrGoto(err, error_free_program);
1469 :
1470 0 : if (buffer[0] == CL_BUILD_NONE)
1471 0 : pszStatus = "CL_BUILD_NONE";
1472 0 : else if (buffer[0] == CL_BUILD_ERROR)
1473 0 : pszStatus = "CL_BUILD_ERROR";
1474 0 : else if (buffer[0] == CL_BUILD_SUCCESS)
1475 0 : pszStatus = "CL_BUILD_SUCCESS";
1476 0 : else if (buffer[0] == CL_BUILD_IN_PROGRESS)
1477 0 : pszStatus = "CL_BUILD_IN_PROGRESS";
1478 :
1479 0 : CPLDebug("OpenCL", "Build Status: %s\nProgram Source:\n%s", pszStatus,
1480 : progBuf.c_str());
1481 0 : goto error_free_program;
1482 : }
1483 :
1484 0 : kernel = clCreateKernel(program, "resamp", &err);
1485 0 : handleErrGoto(err, error_free_program);
1486 :
1487 0 : err = clReleaseProgram(program);
1488 0 : handleErrGoto(err, error_final);
1489 :
1490 0 : return kernel;
1491 :
1492 0 : error_free_program:
1493 0 : err = clReleaseProgram(program);
1494 :
1495 0 : error_final:
1496 0 : return nullptr;
1497 : }
1498 :
1499 : /*
1500 : Alloc & copy the coordinate data from host working memory to the device. The
1501 : working memory should be a pinned, linear, array of floats. This allows us to
1502 : allocate and copy all data in one step. The pointer to the device memory is
1503 : saved and set as the appropriate argument number.
1504 :
1505 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
1506 : */
1507 0 : static cl_int set_coord_data(struct oclWarper *warper, cl_mem *xy)
1508 : {
1509 0 : cl_int err = CL_SUCCESS;
1510 : cl_image_format imgFmt;
1511 :
1512 : // Copy coord data to the device
1513 0 : imgFmt.image_channel_order = warper->xyChOrder;
1514 0 : imgFmt.image_channel_data_type = CL_FLOAT;
1515 :
1516 : #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || defined(__clang__)
1517 : #pragma GCC diagnostic push
1518 : #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
1519 : #endif
1520 0 : (*xy) = clCreateImage2D(warper->context,
1521 : CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &imgFmt,
1522 0 : static_cast<size_t>(warper->xyWidth),
1523 0 : static_cast<size_t>(warper->xyHeight),
1524 0 : sizeof(float) * warper->xyChSize * warper->xyWidth,
1525 0 : warper->xyWork, &err);
1526 : #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || defined(__clang__)
1527 : #pragma GCC diagnostic pop
1528 : #endif
1529 0 : handleErr(err);
1530 :
1531 : // Free the source memory, now that it's copied we don't need it
1532 0 : freeCLMem(warper->xyWorkCL, warper->xyWork);
1533 :
1534 : // Set up argument
1535 0 : if (warper->kern1 != nullptr)
1536 : {
1537 0 : handleErr(err = clSetKernelArg(warper->kern1, 0, sizeof(cl_mem), xy));
1538 : }
1539 0 : if (warper->kern4 != nullptr)
1540 : {
1541 0 : handleErr(err = clSetKernelArg(warper->kern4, 0, sizeof(cl_mem), xy));
1542 : }
1543 :
1544 0 : return CL_SUCCESS;
1545 : }
1546 :
1547 : /*
1548 : Sets the unified density & valid data structures. These are optional structures
1549 : from GDAL, and as such if they are NULL a small placeholder memory segment is
1550 : defined. This is because the spec is unclear on if a NULL value can be passed
1551 : as a kernel argument in place of memory. If it's not NULL, the data is copied
1552 : from the working memory to the device memory. After that, we check if we are
1553 : using the per-band validity mask, and set that as appropriate. At the end, the
1554 : CL mem is passed as the kernel arguments.
1555 :
1556 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
1557 : */
1558 : static cl_int
1559 0 : set_unified_data(struct oclWarper *warper, cl_mem *unifiedSrcDensityCL,
1560 : cl_mem *unifiedSrcValidCL, float *unifiedSrcDensity,
1561 : unsigned int *unifiedSrcValid, cl_mem *useBandSrcValidCL,
1562 : cl_mem *nBandSrcValidCL)
1563 : {
1564 0 : cl_int err = CL_SUCCESS;
1565 0 : size_t sz = warper->srcWidth * warper->srcHeight;
1566 0 : int useValid = warper->nBandSrcValidCL != nullptr;
1567 : // 32 bits in the mask
1568 0 : int validSz = static_cast<int>(sizeof(int) * ((31 + sz) >> 5));
1569 :
1570 : // Copy unifiedSrcDensity if it exists
1571 0 : if (unifiedSrcDensity == nullptr)
1572 : {
1573 : // Alloc dummy device RAM
1574 0 : (*unifiedSrcDensityCL) =
1575 0 : clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
1576 0 : handleErr(err);
1577 : }
1578 : else
1579 : {
1580 : // Alloc & copy all density data
1581 0 : (*unifiedSrcDensityCL) = clCreateBuffer(
1582 : warper->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
1583 : sizeof(float) * sz, unifiedSrcDensity, &err);
1584 0 : handleErr(err);
1585 : }
1586 :
1587 : // Copy unifiedSrcValid if it exists
1588 0 : if (unifiedSrcValid == nullptr)
1589 : {
1590 : // Alloc dummy device RAM
1591 0 : (*unifiedSrcValidCL) =
1592 0 : clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
1593 0 : handleErr(err);
1594 : }
1595 : else
1596 : {
1597 : // Alloc & copy all validity data
1598 0 : (*unifiedSrcValidCL) = clCreateBuffer(
1599 : warper->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, validSz,
1600 : unifiedSrcValid, &err);
1601 0 : handleErr(err);
1602 : }
1603 :
1604 : // Set the band validity usage
1605 0 : if (useValid)
1606 : {
1607 0 : (*useBandSrcValidCL) = clCreateBuffer(
1608 : warper->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
1609 0 : sizeof(char) * warper->numBands, warper->useBandSrcValid, &err);
1610 0 : handleErr(err);
1611 : }
1612 : else
1613 : {
1614 : // Make a fake image so we don't have a NULL pointer
1615 0 : (*useBandSrcValidCL) =
1616 0 : clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
1617 0 : handleErr(err);
1618 : }
1619 :
1620 : // Do a more thorough check for validity
1621 0 : if (useValid)
1622 : {
1623 : int i;
1624 0 : useValid = FALSE;
1625 0 : for (i = 0; i < warper->numBands; ++i)
1626 0 : if (warper->useBandSrcValid[i])
1627 0 : useValid = TRUE;
1628 : }
1629 :
1630 : // And the validity mask if needed
1631 0 : if (useValid)
1632 : {
1633 0 : (*nBandSrcValidCL) = clCreateBuffer(
1634 : warper->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
1635 0 : warper->numBands * validSz, warper->nBandSrcValid, &err);
1636 0 : handleErr(err);
1637 : }
1638 : else
1639 : {
1640 : // Make a fake image so we don't have a NULL pointer
1641 0 : (*nBandSrcValidCL) =
1642 0 : clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
1643 0 : handleErr(err);
1644 : }
1645 :
1646 : // Set up arguments
1647 0 : if (warper->kern1 != nullptr)
1648 : {
1649 0 : handleErr(err = clSetKernelArg(warper->kern1, 3, sizeof(cl_mem),
1650 : unifiedSrcDensityCL));
1651 0 : handleErr(err = clSetKernelArg(warper->kern1, 4, sizeof(cl_mem),
1652 : unifiedSrcValidCL));
1653 0 : handleErr(err = clSetKernelArg(warper->kern1, 5, sizeof(cl_mem),
1654 : useBandSrcValidCL));
1655 0 : handleErr(err = clSetKernelArg(warper->kern1, 6, sizeof(cl_mem),
1656 : nBandSrcValidCL));
1657 : }
1658 0 : if (warper->kern4 != nullptr)
1659 : {
1660 0 : handleErr(err = clSetKernelArg(warper->kern4, 3, sizeof(cl_mem),
1661 : unifiedSrcDensityCL));
1662 0 : handleErr(err = clSetKernelArg(warper->kern4, 4, sizeof(cl_mem),
1663 : unifiedSrcValidCL));
1664 0 : handleErr(err = clSetKernelArg(warper->kern4, 5, sizeof(cl_mem),
1665 : useBandSrcValidCL));
1666 0 : handleErr(err = clSetKernelArg(warper->kern4, 6, sizeof(cl_mem),
1667 : nBandSrcValidCL));
1668 : }
1669 :
1670 0 : return CL_SUCCESS;
1671 : }
1672 :
1673 : /*
1674 : Here we set the per-band raster data. First priority is the real raster data,
1675 : of course. Then, if applicable, we set the additional image channel. Once this
1676 : data is copied to the device, it can be freed on the host, so that is done
1677 : here. Finally the appropriate kernel arguments are set.
1678 :
1679 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
1680 : */
1681 0 : static cl_int set_src_rast_data(struct oclWarper *warper, int iNum, size_t sz,
1682 : cl_channel_order chOrder, cl_mem *srcReal,
1683 : cl_mem *srcImag)
1684 : {
1685 : cl_image_format imgFmt;
1686 0 : cl_int err = CL_SUCCESS;
1687 0 : int useImagWork =
1688 0 : warper->imagWork.v != nullptr && warper->imagWork.v[iNum] != nullptr;
1689 :
1690 : // Set up image vars
1691 0 : imgFmt.image_channel_order = chOrder;
1692 0 : imgFmt.image_channel_data_type = warper->imageFormat;
1693 :
1694 : // Create & copy the source image
1695 : #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || defined(__clang__)
1696 : #pragma GCC diagnostic push
1697 : #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
1698 : #endif
1699 :
1700 0 : (*srcReal) = clCreateImage2D(
1701 : warper->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &imgFmt,
1702 0 : static_cast<size_t>(warper->srcWidth),
1703 0 : static_cast<size_t>(warper->srcHeight), sz * warper->srcWidth,
1704 0 : warper->realWork.v[iNum], &err);
1705 0 : handleErr(err);
1706 :
1707 : // And the source image parts if needed
1708 0 : if (useImagWork)
1709 : {
1710 0 : (*srcImag) = clCreateImage2D(
1711 : warper->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &imgFmt,
1712 0 : static_cast<size_t>(warper->srcWidth),
1713 0 : static_cast<size_t>(warper->srcHeight), sz * warper->srcWidth,
1714 0 : warper->imagWork.v[iNum], &err);
1715 0 : handleErr(err);
1716 : }
1717 : else
1718 : {
1719 : // Make a fake image so we don't have a NULL pointer
1720 :
1721 : char dummyImageData[16];
1722 0 : (*srcImag) = clCreateImage2D(warper->context,
1723 : CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
1724 : &imgFmt, 1, 1, sz, dummyImageData, &err);
1725 :
1726 0 : handleErr(err);
1727 : }
1728 : #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || defined(__clang__)
1729 : #pragma GCC diagnostic pop
1730 : #endif
1731 :
1732 : // Free the source memory, now that it's copied we don't need it
1733 0 : freeCLMem(warper->realWorkCL[iNum], warper->realWork.v[iNum]);
1734 0 : if (warper->imagWork.v != nullptr)
1735 : {
1736 0 : freeCLMem(warper->imagWorkCL[iNum], warper->imagWork.v[iNum]);
1737 : }
1738 :
1739 : // Set up per-band arguments
1740 0 : if (warper->kern1 != nullptr)
1741 : {
1742 0 : handleErr(
1743 : err = clSetKernelArg(warper->kern1, 1, sizeof(cl_mem), srcReal));
1744 0 : handleErr(
1745 : err = clSetKernelArg(warper->kern1, 2, sizeof(cl_mem), srcImag));
1746 : }
1747 0 : if (warper->kern4 != nullptr)
1748 : {
1749 0 : handleErr(
1750 : err = clSetKernelArg(warper->kern4, 1, sizeof(cl_mem), srcReal));
1751 0 : handleErr(
1752 : err = clSetKernelArg(warper->kern4, 2, sizeof(cl_mem), srcImag));
1753 : }
1754 :
1755 0 : return CL_SUCCESS;
1756 : }
1757 :
1758 : /*
1759 : Set the destination data for the raster. Although it's the output, it still
1760 : is copied to the device because some blending is done there. First the real
1761 : data is allocated and copied, then the imag data is allocated and copied if
1762 : needed. They are then set as the appropriate arguments to the kernel.
1763 :
1764 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
1765 : */
1766 0 : static cl_int set_dst_rast_data(struct oclWarper *warper, int iImg, size_t sz,
1767 : cl_mem *dstReal, cl_mem *dstImag)
1768 : {
1769 0 : cl_int err = CL_SUCCESS;
1770 0 : sz *= warper->dstWidth * warper->dstHeight;
1771 :
1772 : // Copy the dst real data
1773 0 : (*dstReal) = clCreateBuffer(warper->context,
1774 : CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sz,
1775 0 : warper->dstRealWork.v[iImg], &err);
1776 0 : handleErr(err);
1777 :
1778 : // Copy the dst imag data if exists
1779 0 : if (warper->dstImagWork.v != nullptr &&
1780 0 : warper->dstImagWork.v[iImg] != nullptr)
1781 : {
1782 0 : (*dstImag) = clCreateBuffer(warper->context,
1783 : CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
1784 0 : sz, warper->dstImagWork.v[iImg], &err);
1785 0 : handleErr(err);
1786 : }
1787 : else
1788 : {
1789 0 : (*dstImag) = clCreateBuffer(warper->context, CL_MEM_READ_WRITE, 1,
1790 : nullptr, &err);
1791 0 : handleErr(err);
1792 : }
1793 :
1794 : // Set up per-band arguments
1795 0 : if (warper->kern1 != nullptr)
1796 : {
1797 0 : handleErr(
1798 : err = clSetKernelArg(warper->kern1, 7, sizeof(cl_mem), dstReal));
1799 0 : handleErr(
1800 : err = clSetKernelArg(warper->kern1, 8, sizeof(cl_mem), dstImag));
1801 : }
1802 0 : if (warper->kern4 != nullptr)
1803 : {
1804 0 : handleErr(
1805 : err = clSetKernelArg(warper->kern4, 7, sizeof(cl_mem), dstReal));
1806 0 : handleErr(
1807 : err = clSetKernelArg(warper->kern4, 8, sizeof(cl_mem), dstImag));
1808 : }
1809 :
1810 0 : return CL_SUCCESS;
1811 : }
1812 :
1813 : /*
1814 : Read the final raster data back from the graphics card to working memory. This
1815 : copies both the real memory and the imag memory if appropriate.
1816 :
1817 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
1818 : */
1819 0 : static cl_int get_dst_rast_data(struct oclWarper *warper, int iImg,
1820 : size_t wordSz, cl_mem dstReal, cl_mem dstImag)
1821 : {
1822 0 : cl_int err = CL_SUCCESS;
1823 0 : size_t sz = warper->dstWidth * warper->dstHeight * wordSz;
1824 :
1825 : // Copy from dev into working memory
1826 0 : handleErr(err = clEnqueueReadBuffer(warper->queue, dstReal, CL_FALSE, 0, sz,
1827 : warper->dstRealWork.v[iImg], 0, nullptr,
1828 : nullptr));
1829 :
1830 : // If we are expecting the imag channel, then copy it back also
1831 0 : if (warper->dstImagWork.v != nullptr &&
1832 0 : warper->dstImagWork.v[iImg] != nullptr)
1833 : {
1834 0 : handleErr(err = clEnqueueReadBuffer(warper->queue, dstImag, CL_FALSE, 0,
1835 : sz, warper->dstImagWork.v[iImg], 0,
1836 : nullptr, nullptr));
1837 : }
1838 :
1839 : // The copy requests were non-blocking, so we'll need to make sure they
1840 : // finish.
1841 0 : handleErr(err = clFinish(warper->queue));
1842 :
1843 0 : return CL_SUCCESS;
1844 : }
1845 :
1846 : /*
1847 : Set the destination image density & validity mask on the device. This is used
1848 : to blend the final output image with the existing buffer. This handles the
1849 : unified structures that apply to all bands. After the buffers are created and
1850 : copied, they are set as kernel arguments.
1851 :
1852 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
1853 : */
1854 0 : static cl_int set_dst_data(struct oclWarper *warper, cl_mem *dstDensityCL,
1855 : cl_mem *dstValidCL, cl_mem *dstNoDataRealCL,
1856 : float *dstDensity, unsigned int *dstValid,
1857 : float *dstNoDataReal)
1858 : {
1859 0 : cl_int err = CL_SUCCESS;
1860 0 : size_t sz = warper->dstWidth * warper->dstHeight;
1861 :
1862 : // Copy the no-data value(s)
1863 0 : if (dstNoDataReal == nullptr)
1864 : {
1865 0 : (*dstNoDataRealCL) =
1866 0 : clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
1867 0 : handleErr(err);
1868 : }
1869 : else
1870 : {
1871 0 : (*dstNoDataRealCL) = clCreateBuffer(
1872 : warper->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
1873 0 : sizeof(float) * warper->numBands, dstNoDataReal, &err);
1874 0 : handleErr(err);
1875 : }
1876 :
1877 : // Copy unifiedSrcDensity if it exists
1878 0 : if (dstDensity == nullptr)
1879 : {
1880 0 : (*dstDensityCL) =
1881 0 : clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
1882 0 : handleErr(err);
1883 : }
1884 : else
1885 : {
1886 0 : (*dstDensityCL) = clCreateBuffer(
1887 : warper->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
1888 : sizeof(float) * sz, dstDensity, &err);
1889 0 : handleErr(err);
1890 : }
1891 :
1892 : // Copy unifiedSrcValid if it exists
1893 0 : if (dstValid == nullptr)
1894 : {
1895 0 : (*dstValidCL) =
1896 0 : clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
1897 0 : handleErr(err);
1898 : }
1899 : else
1900 : {
1901 0 : (*dstValidCL) = clCreateBuffer(
1902 : warper->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
1903 0 : sizeof(int) * ((31 + sz) >> 5), dstValid, &err);
1904 0 : handleErr(err);
1905 : }
1906 :
1907 : // Set up arguments
1908 0 : if (warper->kern1 != nullptr)
1909 : {
1910 0 : handleErr(err = clSetKernelArg(warper->kern1, 9, sizeof(cl_mem),
1911 : dstNoDataRealCL));
1912 0 : handleErr(err = clSetKernelArg(warper->kern1, 10, sizeof(cl_mem),
1913 : dstDensityCL));
1914 0 : handleErr(err = clSetKernelArg(warper->kern1, 11, sizeof(cl_mem),
1915 : dstValidCL));
1916 : }
1917 0 : if (warper->kern4 != nullptr)
1918 : {
1919 0 : handleErr(err = clSetKernelArg(warper->kern4, 9, sizeof(cl_mem),
1920 : dstNoDataRealCL));
1921 0 : handleErr(err = clSetKernelArg(warper->kern4, 10, sizeof(cl_mem),
1922 : dstDensityCL));
1923 0 : handleErr(err = clSetKernelArg(warper->kern4, 11, sizeof(cl_mem),
1924 : dstValidCL));
1925 : }
1926 :
1927 0 : return CL_SUCCESS;
1928 : }
1929 :
1930 : /*
1931 : Go ahead and execute the kernel. This handles some housekeeping stuff like the
1932 : run dimensions. When running in debug mode, it times the kernel call and prints
1933 : the execution time.
1934 :
1935 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
1936 : */
1937 0 : static cl_int execute_kern(struct oclWarper *warper, cl_kernel kern,
1938 : size_t loc_size)
1939 : {
1940 0 : cl_int err = CL_SUCCESS;
1941 : cl_event ev;
1942 : size_t ceil_runs[2];
1943 : size_t group_size[2];
1944 : #ifdef DEBUG_OPENCL
1945 : size_t start_time = 0;
1946 : size_t end_time;
1947 : const char *vecTxt = "";
1948 : #endif
1949 :
1950 : // Use a likely X-dimension which is a power of 2
1951 0 : if (loc_size >= 512)
1952 0 : group_size[0] = 32;
1953 0 : else if (loc_size >= 64)
1954 0 : group_size[0] = 16;
1955 0 : else if (loc_size > 8)
1956 0 : group_size[0] = 8;
1957 : else
1958 0 : group_size[0] = 1;
1959 :
1960 0 : if (group_size[0] > loc_size)
1961 0 : group_size[1] = group_size[0] / loc_size;
1962 : else
1963 0 : group_size[1] = 1;
1964 :
1965 : // Round up num_runs to find the dim of the block of pixels we'll be
1966 : // processing
1967 0 : if (warper->dstWidth % group_size[0])
1968 0 : ceil_runs[0] =
1969 0 : warper->dstWidth + group_size[0] - warper->dstWidth % group_size[0];
1970 : else
1971 0 : ceil_runs[0] = warper->dstWidth;
1972 :
1973 0 : if (warper->dstHeight % group_size[1])
1974 0 : ceil_runs[1] = warper->dstHeight + group_size[1] -
1975 0 : warper->dstHeight % group_size[1];
1976 : else
1977 0 : ceil_runs[1] = warper->dstHeight;
1978 :
1979 : #ifdef DEBUG_OPENCL
1980 : handleErr(err = clSetCommandQueueProperty(
1981 : warper->queue, CL_QUEUE_PROFILING_ENABLE, CL_TRUE, nullptr));
1982 : #endif
1983 :
1984 : // Run the calculation by enqueuing it and forcing the
1985 : // command queue to complete the task
1986 0 : handleErr(err = clEnqueueNDRangeKernel(warper->queue, kern, 2, nullptr,
1987 : ceil_runs, group_size, 0, nullptr,
1988 : &ev));
1989 0 : handleErr(err = clFinish(warper->queue));
1990 :
1991 : #ifdef DEBUG_OPENCL
1992 : handleErr(err = clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START,
1993 : sizeof(size_t), &start_time,
1994 : nullptr));
1995 : handleErr(err =
1996 : clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END,
1997 : sizeof(size_t), &end_time, nullptr));
1998 : assert(end_time != 0);
1999 : assert(start_time != 0);
2000 : if (kern == warper->kern4)
2001 : vecTxt = "(vec)";
2002 :
2003 : CPLDebug("OpenCL", "Kernel Time: %6s %10lu", vecTxt,
2004 : static_cast<long int>((end_time - start_time) / 100000));
2005 : #endif
2006 :
2007 0 : handleErr(err = clReleaseEvent(ev));
2008 0 : return CL_SUCCESS;
2009 : }
2010 :
2011 : /*
2012 : Copy data from a raw source to the warper's working memory. If the imag
2013 : channel is expected, then the data will be de-interlaced into component blocks
2014 : of memory.
2015 :
2016 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
2017 : */
2018 0 : static cl_int set_img_data(struct oclWarper *warper, void *srcImgData,
2019 : unsigned int width, unsigned int height, int isSrc,
2020 : unsigned int bandNum, void **dstRealImgs,
2021 : void **dstImagImgs)
2022 : {
2023 0 : unsigned int imgChSize = warper->imgChSize1;
2024 : unsigned int iSrcY, i;
2025 0 : unsigned int vecOff = 0;
2026 0 : unsigned int imgNum = bandNum;
2027 0 : void *dstReal = nullptr;
2028 0 : void *dstImag = nullptr;
2029 :
2030 : // Handle vector if needed
2031 0 : if (warper->useVec &&
2032 0 : static_cast<int>(bandNum) < warper->numBands - warper->numBands % 4)
2033 : {
2034 0 : imgChSize = warper->imgChSize4;
2035 0 : vecOff = bandNum % 4;
2036 0 : imgNum = bandNum / 4;
2037 : }
2038 0 : else if (warper->useVec)
2039 : {
2040 0 : imgNum = bandNum / 4 + bandNum % 4;
2041 : }
2042 :
2043 : // Set the images as needed
2044 0 : dstReal = dstRealImgs[imgNum];
2045 0 : if (dstImagImgs == nullptr)
2046 0 : dstImag = nullptr;
2047 : else
2048 0 : dstImag = dstImagImgs[imgNum];
2049 :
2050 : // Set stuff for dst imgs
2051 0 : if (!isSrc)
2052 : {
2053 0 : vecOff *= height * width;
2054 0 : imgChSize = 1;
2055 : }
2056 :
2057 : // Copy values as needed
2058 0 : if (warper->imagWorkCL == nullptr && !(warper->useVec && isSrc))
2059 : {
2060 : // Set memory size & location depending on the data type
2061 : // This is the ideal code path for speed
2062 0 : switch (warper->imageFormat)
2063 : {
2064 0 : case CL_UNORM_INT8:
2065 : {
2066 0 : unsigned char *realDst =
2067 0 : &((static_cast<unsigned char *>(dstReal))[vecOff]);
2068 0 : memcpy(realDst, srcImgData,
2069 0 : width * height * sizeof(unsigned char));
2070 0 : break;
2071 : }
2072 0 : case CL_SNORM_INT8:
2073 : {
2074 0 : char *realDst = &((static_cast<char *>(dstReal))[vecOff]);
2075 0 : memcpy(realDst, srcImgData, width * height * sizeof(char));
2076 0 : break;
2077 : }
2078 0 : case CL_UNORM_INT16:
2079 : {
2080 0 : unsigned short *realDst =
2081 0 : &((static_cast<unsigned short *>(dstReal))[vecOff]);
2082 0 : memcpy(realDst, srcImgData,
2083 0 : width * height * sizeof(unsigned short));
2084 0 : break;
2085 : }
2086 0 : case CL_SNORM_INT16:
2087 : {
2088 0 : short *realDst = &((static_cast<short *>(dstReal))[vecOff]);
2089 0 : memcpy(realDst, srcImgData, width * height * sizeof(short));
2090 0 : break;
2091 : }
2092 0 : case CL_FLOAT:
2093 : {
2094 0 : float *realDst = &((static_cast<float *>(dstReal))[vecOff]);
2095 0 : memcpy(realDst, srcImgData, width * height * sizeof(float));
2096 0 : break;
2097 : }
2098 0 : }
2099 : }
2100 0 : else if (warper->imagWorkCL == nullptr)
2101 : {
2102 : // We need to space the values due to OpenCL implementation reasons
2103 0 : for (iSrcY = 0; iSrcY < height; iSrcY++)
2104 : {
2105 0 : int pxOff = width * iSrcY;
2106 0 : int imgOff = imgChSize * pxOff + vecOff;
2107 : // Copy & deinterleave interleaved data
2108 0 : switch (warper->imageFormat)
2109 : {
2110 0 : case CL_UNORM_INT8:
2111 : {
2112 0 : unsigned char *realDst =
2113 0 : &((static_cast<unsigned char *>(dstReal))[imgOff]);
2114 0 : unsigned char *dataSrc =
2115 0 : &((static_cast<unsigned char *>(srcImgData))[pxOff]);
2116 0 : for (i = 0; i < width; ++i)
2117 0 : realDst[imgChSize * i] = dataSrc[i];
2118 : }
2119 0 : break;
2120 0 : case CL_SNORM_INT8:
2121 : {
2122 0 : char *realDst = &((static_cast<char *>(dstReal))[imgOff]);
2123 0 : char *dataSrc = &((static_cast<char *>(srcImgData))[pxOff]);
2124 0 : for (i = 0; i < width; ++i)
2125 0 : realDst[imgChSize * i] = dataSrc[i];
2126 : }
2127 0 : break;
2128 0 : case CL_UNORM_INT16:
2129 : {
2130 0 : unsigned short *realDst =
2131 0 : &((static_cast<unsigned short *>(dstReal))[imgOff]);
2132 0 : unsigned short *dataSrc =
2133 0 : &((static_cast<unsigned short *>(srcImgData))[pxOff]);
2134 0 : for (i = 0; i < width; ++i)
2135 0 : realDst[imgChSize * i] = dataSrc[i];
2136 : }
2137 0 : break;
2138 0 : case CL_SNORM_INT16:
2139 : {
2140 0 : short *realDst = &((static_cast<short *>(dstReal))[imgOff]);
2141 0 : short *dataSrc =
2142 0 : &((static_cast<short *>(srcImgData))[pxOff]);
2143 0 : for (i = 0; i < width; ++i)
2144 0 : realDst[imgChSize * i] = dataSrc[i];
2145 : }
2146 0 : break;
2147 0 : case CL_FLOAT:
2148 : {
2149 0 : float *realDst = &((static_cast<float *>(dstReal))[imgOff]);
2150 0 : float *dataSrc =
2151 0 : &((static_cast<float *>(srcImgData))[pxOff]);
2152 0 : for (i = 0; i < width; ++i)
2153 0 : realDst[imgChSize * i] = dataSrc[i];
2154 : }
2155 0 : break;
2156 : }
2157 : }
2158 : }
2159 : else
2160 : {
2161 0 : assert(dstImag);
2162 :
2163 : // Copy, deinterleave, & space interleaved data
2164 0 : for (iSrcY = 0; iSrcY < height; iSrcY++)
2165 : {
2166 0 : int pxOff = width * iSrcY;
2167 0 : int imgOff = imgChSize * pxOff + vecOff;
2168 : // Copy & deinterleave interleaved data
2169 0 : switch (warper->imageFormat)
2170 : {
2171 0 : case CL_FLOAT:
2172 : {
2173 0 : float *realDst = &((static_cast<float *>(dstReal))[imgOff]);
2174 0 : float *imagDst = &((static_cast<float *>(dstImag))[imgOff]);
2175 0 : float *dataSrc =
2176 0 : &((static_cast<float *>(srcImgData))[pxOff]);
2177 0 : for (i = 0; i < width; ++i)
2178 : {
2179 0 : realDst[imgChSize * i] = dataSrc[i * 2];
2180 0 : imagDst[imgChSize * i] = dataSrc[i * 2 + 1];
2181 : }
2182 : }
2183 0 : break;
2184 0 : case CL_SNORM_INT8:
2185 : {
2186 0 : char *realDst = &((static_cast<char *>(dstReal))[imgOff]);
2187 0 : char *imagDst = &((static_cast<char *>(dstImag))[imgOff]);
2188 0 : char *dataSrc = &((static_cast<char *>(srcImgData))[pxOff]);
2189 0 : for (i = 0; i < width; ++i)
2190 : {
2191 0 : realDst[imgChSize * i] = dataSrc[i * 2];
2192 0 : imagDst[imgChSize * i] = dataSrc[i * 2 + 1];
2193 : }
2194 : }
2195 0 : break;
2196 0 : case CL_UNORM_INT8:
2197 : {
2198 0 : unsigned char *realDst =
2199 0 : &((static_cast<unsigned char *>(dstReal))[imgOff]);
2200 0 : unsigned char *imagDst =
2201 0 : &((static_cast<unsigned char *>(dstImag))[imgOff]);
2202 0 : unsigned char *dataSrc =
2203 0 : &((static_cast<unsigned char *>(srcImgData))[pxOff]);
2204 0 : for (i = 0; i < width; ++i)
2205 : {
2206 0 : realDst[imgChSize * i] = dataSrc[i * 2];
2207 0 : imagDst[imgChSize * i] = dataSrc[i * 2 + 1];
2208 : }
2209 : }
2210 0 : break;
2211 0 : case CL_SNORM_INT16:
2212 : {
2213 0 : short *realDst = &((static_cast<short *>(dstReal))[imgOff]);
2214 0 : short *imagDst = &((static_cast<short *>(dstImag))[imgOff]);
2215 0 : short *dataSrc =
2216 0 : &((static_cast<short *>(srcImgData))[pxOff]);
2217 0 : for (i = 0; i < width; ++i)
2218 : {
2219 0 : realDst[imgChSize * i] = dataSrc[i * 2];
2220 0 : imagDst[imgChSize * i] = dataSrc[i * 2 + 1];
2221 : }
2222 : }
2223 0 : break;
2224 0 : case CL_UNORM_INT16:
2225 : {
2226 0 : unsigned short *realDst =
2227 0 : &((static_cast<unsigned short *>(dstReal))[imgOff]);
2228 0 : unsigned short *imagDst =
2229 0 : &((static_cast<unsigned short *>(dstImag))[imgOff]);
2230 0 : unsigned short *dataSrc =
2231 0 : &((static_cast<unsigned short *>(srcImgData))[pxOff]);
2232 0 : for (i = 0; i < width; ++i)
2233 : {
2234 0 : realDst[imgChSize * i] = dataSrc[i * 2];
2235 0 : imagDst[imgChSize * i] = dataSrc[i * 2 + 1];
2236 : }
2237 : }
2238 0 : break;
2239 : }
2240 : }
2241 : }
2242 :
2243 0 : return CL_SUCCESS;
2244 : }
2245 :
2246 : /*
2247 : Creates the struct which inits & contains the OpenCL context & environment.
2248 : Inits wired(?) space to buffer the image in host RAM. Chooses the OpenCL
2249 : device, perhaps the user can choose it later? This would also choose the
2250 : appropriate OpenCL image format (R, RG, RGBA, or multiples thereof). Space
2251 : for metadata can be allocated as required, though.
2252 :
2253 : Supported image formats are:
2254 : CL_FLOAT, CL_SNORM_INT8, CL_UNORM_INT8, CL_SNORM_INT16, CL_UNORM_INT16
2255 : 32-bit int formats won't keep precision when converted to floats internally
2256 : and doubles are generally not supported on the GPU image formats.
2257 : */
2258 0 : struct oclWarper *GDALWarpKernelOpenCL_createEnv(
2259 : int srcWidth, int srcHeight, int dstWidth, int dstHeight,
2260 : cl_channel_type imageFormat, int numBands, int coordMult, int useImag,
2261 : int useBandSrcValid, CPL_UNUSED float *fDstDensity, double *dfDstNoDataReal,
2262 : OCLResampAlg resampAlg, cl_int *clErr)
2263 : {
2264 : struct oclWarper *warper;
2265 : int i;
2266 0 : size_t maxWidth = 0, maxHeight = 0;
2267 0 : cl_int err = CL_SUCCESS;
2268 : size_t fmtSize, sz;
2269 : cl_device_id device;
2270 : cl_bool bool_flag;
2271 0 : OCLVendor eCLVendor = VENDOR_OTHER;
2272 :
2273 : // Do we have a suitable OpenCL device?
2274 0 : device = get_device(&eCLVendor);
2275 0 : if (device == nullptr)
2276 0 : return nullptr;
2277 :
2278 0 : err = clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool),
2279 : &bool_flag, &sz);
2280 0 : if (err != CL_SUCCESS || !bool_flag)
2281 : {
2282 0 : CPLDebug("OpenCL", "No image support on selected device.");
2283 0 : return nullptr;
2284 : }
2285 :
2286 : // Set up warper environment.
2287 : warper =
2288 0 : static_cast<struct oclWarper *>(CPLCalloc(1, sizeof(struct oclWarper)));
2289 :
2290 0 : warper->eCLVendor = eCLVendor;
2291 :
2292 : // Init passed vars
2293 0 : warper->srcWidth = srcWidth;
2294 0 : warper->srcHeight = srcHeight;
2295 0 : warper->dstWidth = dstWidth;
2296 0 : warper->dstHeight = dstHeight;
2297 :
2298 0 : warper->coordMult = coordMult;
2299 0 : warper->numBands = numBands;
2300 0 : warper->imageFormat = imageFormat;
2301 0 : warper->resampAlg = resampAlg;
2302 :
2303 0 : warper->useUnifiedSrcDensity = FALSE;
2304 0 : warper->useUnifiedSrcValid = FALSE;
2305 0 : warper->useDstDensity = FALSE;
2306 0 : warper->useDstValid = FALSE;
2307 :
2308 0 : warper->imagWorkCL = nullptr;
2309 0 : warper->dstImagWorkCL = nullptr;
2310 0 : warper->useBandSrcValidCL = nullptr;
2311 0 : warper->useBandSrcValid = nullptr;
2312 0 : warper->nBandSrcValidCL = nullptr;
2313 0 : warper->nBandSrcValid = nullptr;
2314 0 : warper->fDstNoDataRealCL = nullptr;
2315 0 : warper->fDstNoDataReal = nullptr;
2316 0 : warper->kern1 = nullptr;
2317 0 : warper->kern4 = nullptr;
2318 :
2319 0 : warper->dev = device;
2320 :
2321 0 : warper->context =
2322 0 : clCreateContext(nullptr, 1, &(warper->dev), nullptr, nullptr, &err);
2323 0 : handleErrGoto(err, error_label);
2324 : #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || defined(__clang__)
2325 : #pragma GCC diagnostic push
2326 : #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
2327 : #endif
2328 0 : warper->queue = clCreateCommandQueue(warper->context, warper->dev, 0, &err);
2329 : #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || defined(__clang__)
2330 : #pragma GCC diagnostic pop
2331 : #endif
2332 0 : handleErrGoto(err, error_label);
2333 :
2334 : // Ensure that we hand handle imagery of these dimensions
2335 0 : err = clGetDeviceInfo(warper->dev, CL_DEVICE_IMAGE2D_MAX_WIDTH,
2336 : sizeof(size_t), &maxWidth, &sz);
2337 0 : handleErrGoto(err, error_label);
2338 0 : err = clGetDeviceInfo(warper->dev, CL_DEVICE_IMAGE2D_MAX_HEIGHT,
2339 : sizeof(size_t), &maxHeight, &sz);
2340 0 : handleErrGoto(err, error_label);
2341 0 : if (maxWidth < static_cast<size_t>(srcWidth) ||
2342 0 : maxHeight < static_cast<size_t>(srcHeight))
2343 : {
2344 0 : err = CL_INVALID_IMAGE_SIZE;
2345 0 : handleErrGoto(err, error_label);
2346 : }
2347 :
2348 : // Split bands into sets of four when possible
2349 : // Cubic runs slower as vector, so don't use it (probably register pressure)
2350 : // Feel free to do more testing and come up with more precise case
2351 : // statements
2352 0 : if (numBands < 4 || resampAlg == OCL_Cubic)
2353 : {
2354 0 : warper->numImages = numBands;
2355 0 : warper->useVec = FALSE;
2356 : }
2357 : else
2358 : {
2359 0 : warper->numImages = numBands / 4 + numBands % 4;
2360 0 : warper->useVec = TRUE;
2361 : }
2362 :
2363 : // Make the pointer space for the real images
2364 0 : warper->realWorkCL =
2365 0 : static_cast<cl_mem *>(CPLCalloc(sizeof(cl_mem), warper->numImages));
2366 0 : warper->dstRealWorkCL =
2367 0 : static_cast<cl_mem *>(CPLCalloc(sizeof(cl_mem), warper->numImages));
2368 :
2369 : // Make space for the per-channel Imag data (if exists)
2370 0 : if (useImag)
2371 : {
2372 0 : warper->imagWorkCL =
2373 0 : static_cast<cl_mem *>(CPLCalloc(sizeof(cl_mem), warper->numImages));
2374 0 : warper->dstImagWorkCL =
2375 0 : static_cast<cl_mem *>(CPLCalloc(sizeof(cl_mem), warper->numImages));
2376 : }
2377 :
2378 : // Make space for the per-band BandSrcValid data (if exists)
2379 0 : if (useBandSrcValid)
2380 : {
2381 : // 32 bits in the mask
2382 0 : sz = warper->numBands *
2383 0 : ((31 + warper->srcWidth * warper->srcHeight) >> 5);
2384 :
2385 : // Allocate some space for the validity of the validity mask
2386 : void *useBandSrcValidTab[1];
2387 : cl_mem useBandSrcValidCLTab[1];
2388 0 : err = alloc_pinned_mem(warper, 0, warper->numBands * sizeof(char),
2389 : useBandSrcValidTab, useBandSrcValidCLTab);
2390 0 : warper->useBandSrcValid = static_cast<char *>(useBandSrcValidTab[0]);
2391 0 : warper->useBandSrcValidCL = useBandSrcValidCLTab[0];
2392 0 : handleErrGoto(err, error_label);
2393 :
2394 0 : for (i = 0; i < warper->numBands; ++i)
2395 0 : warper->useBandSrcValid[i] = FALSE;
2396 :
2397 : // Allocate one array for all the band validity masks.
2398 : // Remember that the masks don't use much memory (they're bitwise).
2399 : void *nBandSrcValidTab[1];
2400 : cl_mem nBandSrcValidCLTab[1];
2401 0 : err = alloc_pinned_mem(warper, 0, sz * sizeof(int), nBandSrcValidTab,
2402 : nBandSrcValidCLTab);
2403 0 : warper->nBandSrcValid = static_cast<float *>(nBandSrcValidTab[0]);
2404 0 : warper->nBandSrcValidCL = nBandSrcValidCLTab[0];
2405 0 : handleErrGoto(err, error_label);
2406 : }
2407 :
2408 : // Make space for the per-band
2409 0 : if (dfDstNoDataReal != nullptr)
2410 : {
2411 : void *fDstNoDataRealTab[1];
2412 : cl_mem fDstNoDataRealCLTab[1];
2413 0 : alloc_pinned_mem(warper, 0, warper->numBands, fDstNoDataRealTab,
2414 : fDstNoDataRealCLTab);
2415 0 : warper->fDstNoDataReal = static_cast<float *>(fDstNoDataRealTab[0]);
2416 0 : warper->fDstNoDataRealCL = fDstNoDataRealCLTab[0];
2417 :
2418 : // Copy over values
2419 0 : for (i = 0; i < warper->numBands; ++i)
2420 0 : warper->fDstNoDataReal[i] = static_cast<float>(dfDstNoDataReal[i]);
2421 : }
2422 :
2423 : // Alloc working host image memory
2424 : // We'll be copying into these buffers soon
2425 0 : switch (imageFormat)
2426 : {
2427 0 : case CL_FLOAT:
2428 0 : err = alloc_working_arr(warper, sizeof(float *), sizeof(float),
2429 : &fmtSize);
2430 0 : break;
2431 0 : case CL_SNORM_INT8:
2432 0 : err = alloc_working_arr(warper, sizeof(char *), sizeof(char),
2433 : &fmtSize);
2434 0 : break;
2435 0 : case CL_UNORM_INT8:
2436 0 : err = alloc_working_arr(warper, sizeof(unsigned char *),
2437 : sizeof(unsigned char), &fmtSize);
2438 0 : break;
2439 0 : case CL_SNORM_INT16:
2440 0 : err = alloc_working_arr(warper, sizeof(short *), sizeof(short),
2441 : &fmtSize);
2442 0 : break;
2443 0 : case CL_UNORM_INT16:
2444 0 : err = alloc_working_arr(warper, sizeof(unsigned short *),
2445 : sizeof(unsigned short), &fmtSize);
2446 0 : break;
2447 : }
2448 0 : handleErrGoto(err, error_label);
2449 :
2450 : // Find a good & compatible image channel order for the Lat/Long array.
2451 0 : err = set_supported_formats(warper, 2, &(warper->xyChOrder),
2452 : &(warper->xyChSize), CL_FLOAT);
2453 0 : handleErrGoto(err, error_label);
2454 :
2455 : // Set coordinate image dimensions
2456 0 : warper->xyWidth =
2457 0 : static_cast<int>(ceil((static_cast<float>(warper->dstWidth) +
2458 0 : static_cast<float>(warper->coordMult) - 1) /
2459 0 : static_cast<float>(warper->coordMult)));
2460 0 : warper->xyHeight =
2461 0 : static_cast<int>(ceil((static_cast<float>(warper->dstHeight) +
2462 0 : static_cast<float>(warper->coordMult) - 1) /
2463 0 : static_cast<float>(warper->coordMult)));
2464 :
2465 : // Alloc coord memory
2466 0 : sz = sizeof(float) * warper->xyChSize * warper->xyWidth * warper->xyHeight;
2467 : void *xyWorkTab[1];
2468 : cl_mem xyWorkCLTab[1];
2469 0 : err = alloc_pinned_mem(warper, 0, sz, xyWorkTab, xyWorkCLTab);
2470 0 : warper->xyWork = static_cast<float *>(xyWorkTab[0]);
2471 0 : warper->xyWorkCL = xyWorkCLTab[0];
2472 0 : handleErrGoto(err, error_label);
2473 :
2474 : // Ensure everything is finished allocating, copying, & mapping
2475 0 : err = clFinish(warper->queue);
2476 0 : handleErrGoto(err, error_label);
2477 :
2478 0 : (*clErr) = CL_SUCCESS;
2479 0 : return warper;
2480 :
2481 0 : error_label:
2482 0 : GDALWarpKernelOpenCL_deleteEnv(warper);
2483 0 : return nullptr;
2484 : }
2485 :
2486 : /*
2487 : Copy the validity mask for an image band to the warper.
2488 :
2489 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
2490 : */
2491 0 : cl_int GDALWarpKernelOpenCL_setSrcValid(struct oclWarper *warper,
2492 : int *bandSrcValid, int bandNum)
2493 : {
2494 : // 32 bits in the mask
2495 0 : int stride = (31 + warper->srcWidth * warper->srcHeight) >> 5;
2496 :
2497 : // Copy bandSrcValid
2498 0 : assert(warper->nBandSrcValid != nullptr);
2499 0 : memcpy(&(warper->nBandSrcValid[bandNum * stride]), bandSrcValid,
2500 0 : sizeof(int) * stride);
2501 0 : warper->useBandSrcValid[bandNum] = TRUE;
2502 :
2503 0 : return CL_SUCCESS;
2504 : }
2505 :
2506 : /*
2507 : Sets the source image real & imag into the host memory so that it is
2508 : permuted (ex. RGBA) for better graphics card access.
2509 :
2510 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
2511 : */
2512 0 : cl_int GDALWarpKernelOpenCL_setSrcImg(struct oclWarper *warper, void *imgData,
2513 : int bandNum)
2514 : {
2515 0 : void **imagWorkPtr = nullptr;
2516 :
2517 0 : if (warper->imagWorkCL != nullptr)
2518 0 : imagWorkPtr = warper->imagWork.v;
2519 :
2520 0 : return set_img_data(warper, imgData, warper->srcWidth, warper->srcHeight,
2521 0 : TRUE, bandNum, warper->realWork.v, imagWorkPtr);
2522 : }
2523 :
2524 : /*
2525 : Sets the destination image real & imag into the host memory so that it is
2526 : permuted (ex. RGBA) for better graphics card access.
2527 :
2528 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
2529 : */
2530 0 : cl_int GDALWarpKernelOpenCL_setDstImg(struct oclWarper *warper, void *imgData,
2531 : int bandNum)
2532 : {
2533 0 : void **dstImagWorkPtr = nullptr;
2534 :
2535 0 : if (warper->dstImagWorkCL != nullptr)
2536 0 : dstImagWorkPtr = warper->dstImagWork.v;
2537 :
2538 0 : return set_img_data(warper, imgData, warper->dstWidth, warper->dstHeight,
2539 0 : FALSE, bandNum, warper->dstRealWork.v, dstImagWorkPtr);
2540 : }
2541 :
2542 : /*
2543 : Inputs the source coordinates for a row of the destination pixels. Invalid
2544 : coordinates are set as -99.0, which should be out of the image bounds. Sets
2545 : the coordinates as ready to be used in OpenCL image memory: interleaved and
2546 : minus the offset. By using image memory, we can use a smaller texture for
2547 : coordinates and use OpenCL's built-in interpolation to save memory.
2548 :
2549 : What it does: generates a smaller matrix of X/Y coordinate transformation
2550 : values from an original matrix. When bilinearly sampled in the GPU hardware,
2551 : the generated values are as close as possible to the original matrix.
2552 :
2553 : Complication: matrices have arbitrary dimensions and the sub-sampling factor
2554 : is an arbitrary integer greater than zero. Getting the edge cases right is
2555 : difficult.
2556 :
2557 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
2558 : */
2559 0 : cl_int GDALWarpKernelOpenCL_setCoordRow(struct oclWarper *warper,
2560 : double *rowSrcX, double *rowSrcY,
2561 : double srcXOff, double srcYOff,
2562 : int *success, int rowNum)
2563 : {
2564 0 : int coordMult = warper->coordMult;
2565 0 : int width = warper->dstWidth;
2566 0 : int height = warper->dstHeight;
2567 0 : int xyWidth = warper->xyWidth;
2568 : int i;
2569 0 : int xyChSize = warper->xyChSize;
2570 0 : float *xyPtr, *xyPrevPtr = nullptr;
2571 0 : int lastRow = rowNum == height - 1;
2572 0 : double dstHeightMod = 1.0, dstWidthMod = 1.0;
2573 :
2574 : // Return if we're at an off row
2575 0 : if (!lastRow && rowNum % coordMult != 0)
2576 0 : return CL_SUCCESS;
2577 :
2578 : // Standard row, adjusted for the skipped rows
2579 0 : xyPtr = &(warper->xyWork[xyWidth * xyChSize * rowNum / coordMult]);
2580 :
2581 : // Find our row
2582 0 : if (lastRow)
2583 : {
2584 : // Setup for the final row
2585 0 : xyPtr = &(warper->xyWork[xyWidth * xyChSize * (warper->xyHeight - 1)]);
2586 0 : xyPrevPtr =
2587 0 : &(warper->xyWork[xyWidth * xyChSize * (warper->xyHeight - 2)]);
2588 :
2589 0 : if ((height - 1) % coordMult)
2590 0 : dstHeightMod = static_cast<double>(coordMult) /
2591 0 : static_cast<double>((height - 1) % coordMult);
2592 : }
2593 :
2594 : // Copy selected coordinates
2595 0 : for (i = 0; i < width; i += coordMult)
2596 : {
2597 0 : if (success[i])
2598 : {
2599 0 : xyPtr[0] = static_cast<float>(rowSrcX[i] - srcXOff);
2600 0 : xyPtr[1] = static_cast<float>(rowSrcY[i] - srcYOff);
2601 :
2602 0 : if (lastRow)
2603 : {
2604 : // Adjust bottom row so interpolator returns correct value
2605 0 : xyPtr[0] = static_cast<float>(
2606 0 : dstHeightMod * (xyPtr[0] - xyPrevPtr[0]) + xyPrevPtr[0]);
2607 0 : xyPtr[1] = static_cast<float>(
2608 0 : dstHeightMod * (xyPtr[1] - xyPrevPtr[1]) + xyPrevPtr[1]);
2609 : }
2610 : }
2611 : else
2612 : {
2613 0 : xyPtr[0] = -99.0f;
2614 0 : xyPtr[1] = -99.0f;
2615 : }
2616 :
2617 0 : xyPtr += xyChSize;
2618 0 : xyPrevPtr += xyChSize;
2619 : }
2620 :
2621 : // Copy remaining coordinate
2622 0 : if ((width - 1) % coordMult)
2623 : {
2624 0 : dstWidthMod = static_cast<double>(coordMult) /
2625 0 : static_cast<double>((width - 1) % coordMult);
2626 0 : xyPtr -= xyChSize;
2627 0 : xyPrevPtr -= xyChSize;
2628 : }
2629 : else
2630 : {
2631 0 : xyPtr -= xyChSize * 2;
2632 0 : xyPrevPtr -= xyChSize * 2;
2633 : }
2634 :
2635 0 : if (lastRow)
2636 : {
2637 0 : double origX = rowSrcX[width - 1] - srcXOff;
2638 0 : double origY = rowSrcY[width - 1] - srcYOff;
2639 0 : double a = 1.0, b = 1.0;
2640 :
2641 : // Calculate the needed x/y values using an equation from the OpenCL
2642 : // Spec section 8.2, solving for Ti1j1
2643 0 : if ((width - 1) % coordMult)
2644 0 : a = ((width - 1) % coordMult) / static_cast<double>(coordMult);
2645 :
2646 0 : if ((height - 1) % coordMult)
2647 0 : b = ((height - 1) % coordMult) / static_cast<double>(coordMult);
2648 :
2649 0 : xyPtr[xyChSize] = static_cast<float>(
2650 0 : (((1.0 - a) * (1.0 - b) * xyPrevPtr[0] +
2651 0 : a * (1.0 - b) * xyPrevPtr[xyChSize] + (1.0 - a) * b * xyPtr[0]) -
2652 0 : origX) /
2653 0 : (-a * b));
2654 :
2655 0 : xyPtr[xyChSize + 1] =
2656 0 : static_cast<float>((((1.0 - a) * (1.0 - b) * xyPrevPtr[1] +
2657 0 : a * (1.0 - b) * xyPrevPtr[xyChSize + 1] +
2658 0 : (1.0 - a) * b * xyPtr[1]) -
2659 0 : origY) /
2660 0 : (-a * b));
2661 : }
2662 : else
2663 : {
2664 : // Adjust last coordinate so interpolator returns correct value
2665 0 : xyPtr[xyChSize] = static_cast<float>(
2666 0 : dstWidthMod * (rowSrcX[width - 1] - srcXOff - xyPtr[0]) + xyPtr[0]);
2667 0 : xyPtr[xyChSize + 1] = static_cast<float>(
2668 0 : dstWidthMod * (rowSrcY[width - 1] - srcYOff - xyPtr[1]) + xyPtr[1]);
2669 : }
2670 :
2671 0 : return CL_SUCCESS;
2672 : }
2673 :
2674 : /*
2675 : Copies all data to the device RAM, frees the host RAM, runs the
2676 : appropriate resampling kernel, mallocs output space, & copies the data
2677 : back from the device RAM for each band. Also check to make sure that
2678 : setRow*() was called the appropriate number of times to init all image
2679 : data.
2680 :
2681 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
2682 : */
2683 0 : cl_int GDALWarpKernelOpenCL_runResamp(
2684 : struct oclWarper *warper, float *unifiedSrcDensity,
2685 : unsigned int *unifiedSrcValid, float *dstDensity, unsigned int *dstValid,
2686 : double dfXScale, double dfYScale, double dfXFilter, double dfYFilter,
2687 : int nXRadius, int nYRadius, int nFiltInitX, int nFiltInitY)
2688 : {
2689 0 : int i, nextBandNum = 0, chSize = 1;
2690 0 : cl_int err = CL_SUCCESS;
2691 : cl_mem xy, unifiedSrcDensityCL, unifiedSrcValidCL;
2692 : cl_mem dstDensityCL, dstValidCL, dstNoDataRealCL;
2693 : cl_mem useBandSrcValidCL, nBandSrcValidCL;
2694 0 : size_t groupSize, wordSize = 0;
2695 0 : cl_kernel kern = nullptr;
2696 : cl_channel_order chOrder;
2697 :
2698 0 : warper->useUnifiedSrcDensity = unifiedSrcDensity != nullptr;
2699 0 : warper->useUnifiedSrcValid = unifiedSrcValid != nullptr;
2700 :
2701 : // Check the word size
2702 0 : switch (warper->imageFormat)
2703 : {
2704 0 : case CL_FLOAT:
2705 0 : wordSize = sizeof(float);
2706 0 : break;
2707 0 : case CL_SNORM_INT8:
2708 0 : wordSize = sizeof(char);
2709 0 : break;
2710 0 : case CL_UNORM_INT8:
2711 0 : wordSize = sizeof(unsigned char);
2712 0 : break;
2713 0 : case CL_SNORM_INT16:
2714 0 : wordSize = sizeof(short);
2715 0 : break;
2716 0 : case CL_UNORM_INT16:
2717 0 : wordSize = sizeof(unsigned short);
2718 0 : break;
2719 : }
2720 :
2721 : // Compile the kernel; the invariants are being compiled into the code
2722 0 : if (!warper->useVec || warper->numBands % 4)
2723 : {
2724 0 : warper->kern1 =
2725 0 : get_kernel(warper, FALSE, dfXScale, dfYScale, dfXFilter, dfYFilter,
2726 : nXRadius, nYRadius, nFiltInitX, nFiltInitY, &err);
2727 0 : handleErr(err);
2728 : }
2729 0 : if (warper->useVec)
2730 : {
2731 0 : warper->kern4 =
2732 0 : get_kernel(warper, TRUE, dfXScale, dfYScale, dfXFilter, dfYFilter,
2733 : nXRadius, nYRadius, nFiltInitX, nFiltInitY, &err);
2734 0 : handleErr(err);
2735 : }
2736 :
2737 : // Copy coord data to the device
2738 0 : handleErr(err = set_coord_data(warper, &xy));
2739 :
2740 : // Copy unified density & valid data
2741 0 : handleErr(err = set_unified_data(warper, &unifiedSrcDensityCL,
2742 : &unifiedSrcValidCL, unifiedSrcDensity,
2743 : unifiedSrcValid, &useBandSrcValidCL,
2744 : &nBandSrcValidCL));
2745 :
2746 : // Copy output density & valid data
2747 0 : handleErr(set_dst_data(warper, &dstDensityCL, &dstValidCL, &dstNoDataRealCL,
2748 : dstDensity, dstValid, warper->fDstNoDataReal));
2749 :
2750 : // What's the recommended group size?
2751 0 : if (warper->useVec)
2752 : {
2753 : // Start with the vector kernel
2754 0 : handleErr(clGetKernelWorkGroupInfo(
2755 : warper->kern4, warper->dev, CL_KERNEL_WORK_GROUP_SIZE,
2756 : sizeof(size_t), &groupSize, nullptr));
2757 0 : kern = warper->kern4;
2758 0 : chSize = warper->imgChSize4;
2759 0 : chOrder = warper->imgChOrder4;
2760 : }
2761 : else
2762 : {
2763 : // We're only using the float kernel
2764 0 : handleErr(clGetKernelWorkGroupInfo(
2765 : warper->kern1, warper->dev, CL_KERNEL_WORK_GROUP_SIZE,
2766 : sizeof(size_t), &groupSize, nullptr));
2767 0 : kern = warper->kern1;
2768 0 : chSize = warper->imgChSize1;
2769 0 : chOrder = warper->imgChOrder1;
2770 : }
2771 :
2772 : // Loop over each image
2773 0 : for (i = 0; i < warper->numImages; ++i)
2774 : {
2775 : cl_mem srcImag, srcReal;
2776 : cl_mem dstReal, dstImag;
2777 0 : int bandNum = nextBandNum;
2778 :
2779 : // Switch kernels if needed
2780 0 : if (warper->useVec &&
2781 0 : nextBandNum < warper->numBands - warper->numBands % 4)
2782 : {
2783 0 : nextBandNum += 4;
2784 : }
2785 : else
2786 : {
2787 0 : if (kern == warper->kern4)
2788 : {
2789 0 : handleErr(clGetKernelWorkGroupInfo(
2790 : warper->kern1, warper->dev, CL_KERNEL_WORK_GROUP_SIZE,
2791 : sizeof(size_t), &groupSize, nullptr));
2792 0 : kern = warper->kern1;
2793 0 : chSize = warper->imgChSize1;
2794 0 : chOrder = warper->imgChOrder1;
2795 : }
2796 0 : ++nextBandNum;
2797 : }
2798 :
2799 : // Create & copy the source image
2800 0 : handleErr(err = set_src_rast_data(warper, i, chSize * wordSize, chOrder,
2801 : &srcReal, &srcImag));
2802 :
2803 : // Create & copy the output image
2804 0 : if (kern == warper->kern1)
2805 : {
2806 0 : handleErr(err = set_dst_rast_data(warper, i, wordSize, &dstReal,
2807 : &dstImag));
2808 : }
2809 : else
2810 : {
2811 0 : handleErr(err = set_dst_rast_data(warper, i, wordSize * 4, &dstReal,
2812 : &dstImag));
2813 : }
2814 :
2815 : // Set the bandNum
2816 0 : handleErr(err = clSetKernelArg(kern, 12, sizeof(int), &bandNum));
2817 :
2818 : // Run the kernel
2819 0 : handleErr(err = execute_kern(warper, kern, groupSize));
2820 :
2821 : // Free loop CL mem
2822 0 : handleErr(err = clReleaseMemObject(srcReal));
2823 0 : handleErr(err = clReleaseMemObject(srcImag));
2824 :
2825 : // Copy the back output results
2826 0 : if (kern == warper->kern1)
2827 : {
2828 0 : handleErr(
2829 : err = get_dst_rast_data(warper, i, wordSize, dstReal, dstImag));
2830 : }
2831 : else
2832 : {
2833 0 : handleErr(err = get_dst_rast_data(warper, i, wordSize * 4, dstReal,
2834 : dstImag));
2835 : }
2836 :
2837 : // Free remaining CL mem
2838 0 : handleErr(err = clReleaseMemObject(dstReal));
2839 0 : handleErr(err = clReleaseMemObject(dstImag));
2840 : }
2841 :
2842 : // Free remaining CL mem
2843 0 : handleErr(err = clReleaseMemObject(xy));
2844 0 : handleErr(err = clReleaseMemObject(unifiedSrcDensityCL));
2845 0 : handleErr(err = clReleaseMemObject(unifiedSrcValidCL));
2846 0 : handleErr(err = clReleaseMemObject(useBandSrcValidCL));
2847 0 : handleErr(err = clReleaseMemObject(nBandSrcValidCL));
2848 0 : handleErr(err = clReleaseMemObject(dstDensityCL));
2849 0 : handleErr(err = clReleaseMemObject(dstValidCL));
2850 0 : handleErr(err = clReleaseMemObject(dstNoDataRealCL));
2851 :
2852 0 : return CL_SUCCESS;
2853 : }
2854 :
2855 : /*
2856 : Sets pointers to the floating point data in the warper. The pointers
2857 : are internal to the warper structure, so don't free() them. If the imag
2858 : channel is in use, it will receive a pointer. Otherwise it'll be set to NULL.
2859 : These are pointers to floating point data, so the caller will need to
2860 : manipulate the output as appropriate before saving the data.
2861 :
2862 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
2863 : */
2864 0 : cl_int GDALWarpKernelOpenCL_getRow(struct oclWarper *warper, void **rowReal,
2865 : void **rowImag, int rowNum, int bandNum)
2866 : {
2867 0 : int memOff = rowNum * warper->dstWidth;
2868 0 : int imgNum = bandNum;
2869 :
2870 0 : if (warper->useVec && bandNum < warper->numBands - warper->numBands % 4)
2871 : {
2872 0 : memOff += warper->dstWidth * warper->dstHeight * (bandNum % 4);
2873 0 : imgNum = bandNum / 4;
2874 : }
2875 0 : else if (warper->useVec)
2876 : {
2877 0 : imgNum = bandNum / 4 + bandNum % 4;
2878 : }
2879 :
2880 : // Return pointers into the warper's data
2881 0 : switch (warper->imageFormat)
2882 : {
2883 0 : case CL_FLOAT:
2884 0 : (*rowReal) = &(warper->dstRealWork.f[imgNum][memOff]);
2885 0 : break;
2886 0 : case CL_SNORM_INT8:
2887 0 : (*rowReal) = &(warper->dstRealWork.c[imgNum][memOff]);
2888 0 : break;
2889 0 : case CL_UNORM_INT8:
2890 0 : (*rowReal) = &(warper->dstRealWork.uc[imgNum][memOff]);
2891 0 : break;
2892 0 : case CL_SNORM_INT16:
2893 0 : (*rowReal) = &(warper->dstRealWork.s[imgNum][memOff]);
2894 0 : break;
2895 0 : case CL_UNORM_INT16:
2896 0 : (*rowReal) = &(warper->dstRealWork.us[imgNum][memOff]);
2897 0 : break;
2898 : }
2899 :
2900 0 : if (warper->dstImagWorkCL == nullptr)
2901 : {
2902 0 : (*rowImag) = nullptr;
2903 : }
2904 : else
2905 : {
2906 0 : switch (warper->imageFormat)
2907 : {
2908 0 : case CL_FLOAT:
2909 0 : (*rowImag) = &(warper->dstImagWork.f[imgNum][memOff]);
2910 0 : break;
2911 0 : case CL_SNORM_INT8:
2912 0 : (*rowImag) = &(warper->dstImagWork.c[imgNum][memOff]);
2913 0 : break;
2914 0 : case CL_UNORM_INT8:
2915 0 : (*rowImag) = &(warper->dstImagWork.uc[imgNum][memOff]);
2916 0 : break;
2917 0 : case CL_SNORM_INT16:
2918 0 : (*rowImag) = &(warper->dstImagWork.s[imgNum][memOff]);
2919 0 : break;
2920 0 : case CL_UNORM_INT16:
2921 0 : (*rowImag) = &(warper->dstImagWork.us[imgNum][memOff]);
2922 0 : break;
2923 : }
2924 : }
2925 :
2926 0 : return CL_SUCCESS;
2927 : }
2928 :
2929 : /*
2930 : Free the OpenCL warper environment. It should check everything for NULL, so
2931 : be sure to mark free()ed pointers as NULL or it'll be double free()ed.
2932 :
2933 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
2934 : */
2935 0 : cl_int GDALWarpKernelOpenCL_deleteEnv(struct oclWarper *warper)
2936 : {
2937 : int i;
2938 0 : cl_int err = CL_SUCCESS;
2939 :
2940 0 : for (i = 0; i < warper->numImages; ++i)
2941 : {
2942 : // Run free!!
2943 0 : void *dummy = nullptr;
2944 0 : if (warper->realWork.v)
2945 0 : freeCLMem(warper->realWorkCL[i], warper->realWork.v[i]);
2946 : else
2947 0 : freeCLMem(warper->realWorkCL[i], dummy);
2948 0 : if (warper->realWork.v)
2949 0 : freeCLMem(warper->dstRealWorkCL[i], warper->dstRealWork.v[i]);
2950 : else
2951 0 : freeCLMem(warper->dstRealWorkCL[i], dummy);
2952 :
2953 : //(As applicable)
2954 0 : if (warper->imagWorkCL != nullptr && warper->imagWork.v != nullptr &&
2955 0 : warper->imagWork.v[i] != nullptr)
2956 : {
2957 0 : freeCLMem(warper->imagWorkCL[i], warper->imagWork.v[i]);
2958 : }
2959 0 : if (warper->dstImagWorkCL != nullptr &&
2960 0 : warper->dstImagWork.v != nullptr &&
2961 0 : warper->dstImagWork.v[i] != nullptr)
2962 : {
2963 0 : freeCLMem(warper->dstImagWorkCL[i], warper->dstImagWork.v[i]);
2964 : }
2965 : }
2966 :
2967 : // Free cl_mem
2968 0 : freeCLMem(warper->useBandSrcValidCL, warper->useBandSrcValid);
2969 0 : freeCLMem(warper->nBandSrcValidCL, warper->nBandSrcValid);
2970 0 : freeCLMem(warper->xyWorkCL, warper->xyWork);
2971 0 : freeCLMem(warper->fDstNoDataRealCL, warper->fDstNoDataReal);
2972 :
2973 : // Free pointers to cl_mem*
2974 0 : if (warper->realWorkCL != nullptr)
2975 0 : CPLFree(warper->realWorkCL);
2976 0 : if (warper->dstRealWorkCL != nullptr)
2977 0 : CPLFree(warper->dstRealWorkCL);
2978 :
2979 0 : if (warper->imagWorkCL != nullptr)
2980 0 : CPLFree(warper->imagWorkCL);
2981 0 : if (warper->dstImagWorkCL != nullptr)
2982 0 : CPLFree(warper->dstImagWorkCL);
2983 :
2984 0 : if (warper->realWork.v != nullptr)
2985 0 : CPLFree(warper->realWork.v);
2986 0 : if (warper->dstRealWork.v != nullptr)
2987 0 : CPLFree(warper->dstRealWork.v);
2988 :
2989 0 : if (warper->imagWork.v != nullptr)
2990 0 : CPLFree(warper->imagWork.v);
2991 0 : if (warper->dstImagWork.v != nullptr)
2992 0 : CPLFree(warper->dstImagWork.v);
2993 :
2994 : // Free OpenCL structures
2995 0 : if (warper->kern1 != nullptr)
2996 0 : clReleaseKernel(warper->kern1);
2997 0 : if (warper->kern4 != nullptr)
2998 0 : clReleaseKernel(warper->kern4);
2999 0 : if (warper->queue != nullptr)
3000 0 : clReleaseCommandQueue(warper->queue);
3001 0 : if (warper->context != nullptr)
3002 0 : clReleaseContext(warper->context);
3003 :
3004 0 : CPLFree(warper);
3005 :
3006 0 : return CL_SUCCESS;
3007 : }
3008 :
3009 : #endif /* defined(HAVE_OPENCL) */
|