Line data Source code
1 : /******************************************************************************
2 : * $Id$
3 : *
4 : * Project: OpenCL Image Reprojector
5 : * Purpose: Implementation of the GDALWarpKernel reprojector in OpenCL.
6 : * Author: Seth Price, seth@pricepages.org
7 : *
8 : ******************************************************************************
9 : * Copyright (c) 2010, Seth Price <seth@pricepages.org>
10 : * Copyright (c) 2010-2012, Even Rouault <even dot rouault at spatialys.com>
11 : *
12 : * SPDX-License-Identifier: MIT
13 : ****************************************************************************/
14 :
15 : #include <algorithm>
16 : #if defined(HAVE_OPENCL)
17 :
18 : /* The following line may be uncommented for increased debugging traces and
19 : * profiling */
20 : /* #define DEBUG_OPENCL 1 */
21 :
22 : #include <assert.h>
23 : #include <stdlib.h>
24 : #include <stdio.h>
25 : #include <limits.h>
26 : #include <float.h>
27 : #include <limits>
28 : #include <vector>
29 : #include "cpl_string.h"
30 : #include "gdalwarpkernel_opencl.h"
31 :
32 : #define handleErr(err) \
33 : do \
34 : { \
35 : if ((err) != CL_SUCCESS) \
36 : { \
37 : CPLError(CE_Failure, CPLE_AppDefined, \
38 : "Error at file %s line %d: %s", __FILE__, __LINE__, \
39 : getCLErrorString(err)); \
40 : return err; \
41 : } \
42 : } while (0)
43 :
44 : #define handleErrRetNULL(err) \
45 : do \
46 : { \
47 : if ((err) != CL_SUCCESS) \
48 : { \
49 : (*clErr) = err; \
50 : CPLError(CE_Failure, CPLE_AppDefined, \
51 : "Error at file %s line %d: %s", __FILE__, __LINE__, \
52 : getCLErrorString(err)); \
53 : return nullptr; \
54 : } \
55 : } while (0)
56 :
57 : #define handleErrGoto(err, goto_label) \
58 : do \
59 : { \
60 : if ((err) != CL_SUCCESS) \
61 : { \
62 : (*clErr) = err; \
63 : CPLError(CE_Failure, CPLE_AppDefined, \
64 : "Error at file %s line %d: %s", __FILE__, __LINE__, \
65 : getCLErrorString(err)); \
66 : goto goto_label; \
67 : } \
68 : } while (0)
69 :
70 : #define freeCLMem(clMem, fallBackMem) \
71 : do \
72 : { \
73 : if ((clMem) != nullptr) \
74 : { \
75 : handleErr(err = clReleaseMemObject(clMem)); \
76 : clMem = nullptr; \
77 : fallBackMem = nullptr; \
78 : } \
79 : else if ((fallBackMem) != nullptr) \
80 : { \
81 : CPLFree(fallBackMem); \
82 : fallBackMem = nullptr; \
83 : } \
84 : } while (false)
85 :
86 0 : static const char *getCLErrorString(cl_int err)
87 : {
88 0 : switch (err)
89 : {
90 0 : case CL_SUCCESS:
91 0 : return ("CL_SUCCESS");
92 : break;
93 0 : case CL_DEVICE_NOT_FOUND:
94 0 : return ("CL_DEVICE_NOT_FOUND");
95 : break;
96 0 : case CL_DEVICE_NOT_AVAILABLE:
97 0 : return ("CL_DEVICE_NOT_AVAILABLE");
98 : break;
99 0 : case CL_COMPILER_NOT_AVAILABLE:
100 0 : return ("CL_COMPILER_NOT_AVAILABLE");
101 : break;
102 0 : case CL_MEM_OBJECT_ALLOCATION_FAILURE:
103 0 : return ("CL_MEM_OBJECT_ALLOCATION_FAILURE");
104 : break;
105 0 : case CL_OUT_OF_RESOURCES:
106 0 : return ("CL_OUT_OF_RESOURCES");
107 : break;
108 0 : case CL_OUT_OF_HOST_MEMORY:
109 0 : return ("CL_OUT_OF_HOST_MEMORY");
110 : break;
111 0 : case CL_PROFILING_INFO_NOT_AVAILABLE:
112 0 : return ("CL_PROFILING_INFO_NOT_AVAILABLE");
113 : break;
114 0 : case CL_MEM_COPY_OVERLAP:
115 0 : return ("CL_MEM_COPY_OVERLAP");
116 : break;
117 0 : case CL_IMAGE_FORMAT_MISMATCH:
118 0 : return ("CL_IMAGE_FORMAT_MISMATCH");
119 : break;
120 0 : case CL_IMAGE_FORMAT_NOT_SUPPORTED:
121 0 : return ("CL_IMAGE_FORMAT_NOT_SUPPORTED");
122 : break;
123 0 : case CL_BUILD_PROGRAM_FAILURE:
124 0 : return ("CL_BUILD_PROGRAM_FAILURE");
125 : break;
126 0 : case CL_MAP_FAILURE:
127 0 : return ("CL_MAP_FAILURE");
128 : break;
129 0 : case CL_INVALID_VALUE:
130 0 : return ("CL_INVALID_VALUE");
131 : break;
132 0 : case CL_INVALID_DEVICE_TYPE:
133 0 : return ("CL_INVALID_DEVICE_TYPE");
134 : break;
135 0 : case CL_INVALID_PLATFORM:
136 0 : return ("CL_INVALID_PLATFORM");
137 : break;
138 0 : case CL_INVALID_DEVICE:
139 0 : return ("CL_INVALID_DEVICE");
140 : break;
141 0 : case CL_INVALID_CONTEXT:
142 0 : return ("CL_INVALID_CONTEXT");
143 : break;
144 0 : case CL_INVALID_QUEUE_PROPERTIES:
145 0 : return ("CL_INVALID_QUEUE_PROPERTIES");
146 : break;
147 0 : case CL_INVALID_COMMAND_QUEUE:
148 0 : return ("CL_INVALID_COMMAND_QUEUE");
149 : break;
150 0 : case CL_INVALID_HOST_PTR:
151 0 : return ("CL_INVALID_HOST_PTR");
152 : break;
153 0 : case CL_INVALID_MEM_OBJECT:
154 0 : return ("CL_INVALID_MEM_OBJECT");
155 : break;
156 0 : case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
157 0 : return ("CL_INVALID_IMAGE_FORMAT_DESCRIPTOR");
158 : break;
159 0 : case CL_INVALID_IMAGE_SIZE:
160 0 : return ("CL_INVALID_IMAGE_SIZE");
161 : break;
162 0 : case CL_INVALID_SAMPLER:
163 0 : return ("CL_INVALID_SAMPLER");
164 : break;
165 0 : case CL_INVALID_BINARY:
166 0 : return ("CL_INVALID_BINARY");
167 : break;
168 0 : case CL_INVALID_BUILD_OPTIONS:
169 0 : return ("CL_INVALID_BUILD_OPTIONS");
170 : break;
171 0 : case CL_INVALID_PROGRAM:
172 0 : return ("CL_INVALID_PROGRAM");
173 : break;
174 0 : case CL_INVALID_PROGRAM_EXECUTABLE:
175 0 : return ("CL_INVALID_PROGRAM_EXECUTABLE");
176 : break;
177 0 : case CL_INVALID_KERNEL_NAME:
178 0 : return ("CL_INVALID_KERNEL_NAME");
179 : break;
180 0 : case CL_INVALID_KERNEL_DEFINITION:
181 0 : return ("CL_INVALID_KERNEL_DEFINITION");
182 : break;
183 0 : case CL_INVALID_KERNEL:
184 0 : return ("CL_INVALID_KERNEL");
185 : break;
186 0 : case CL_INVALID_ARG_INDEX:
187 0 : return ("CL_INVALID_ARG_INDEX");
188 : break;
189 0 : case CL_INVALID_ARG_VALUE:
190 0 : return ("CL_INVALID_ARG_VALUE");
191 : break;
192 0 : case CL_INVALID_ARG_SIZE:
193 0 : return ("CL_INVALID_ARG_SIZE");
194 : break;
195 0 : case CL_INVALID_KERNEL_ARGS:
196 0 : return ("CL_INVALID_KERNEL_ARGS");
197 : break;
198 0 : case CL_INVALID_WORK_DIMENSION:
199 0 : return ("CL_INVALID_WORK_DIMENSION");
200 : break;
201 0 : case CL_INVALID_WORK_GROUP_SIZE:
202 0 : return ("CL_INVALID_WORK_GROUP_SIZE");
203 : break;
204 0 : case CL_INVALID_WORK_ITEM_SIZE:
205 0 : return ("CL_INVALID_WORK_ITEM_SIZE");
206 : break;
207 0 : case CL_INVALID_GLOBAL_OFFSET:
208 0 : return ("CL_INVALID_GLOBAL_OFFSET");
209 : break;
210 0 : case CL_INVALID_EVENT_WAIT_LIST:
211 0 : return ("CL_INVALID_EVENT_WAIT_LIST");
212 : break;
213 0 : case CL_INVALID_EVENT:
214 0 : return ("CL_INVALID_EVENT");
215 : break;
216 0 : case CL_INVALID_OPERATION:
217 0 : return ("CL_INVALID_OPERATION");
218 : break;
219 0 : case CL_INVALID_GL_OBJECT:
220 0 : return ("CL_INVALID_GL_OBJECT");
221 : break;
222 0 : case CL_INVALID_BUFFER_SIZE:
223 0 : return ("CL_INVALID_BUFFER_SIZE");
224 : break;
225 0 : case CL_INVALID_MIP_LEVEL:
226 0 : return ("CL_INVALID_MIP_LEVEL");
227 : break;
228 0 : case CL_INVALID_GLOBAL_WORK_SIZE:
229 0 : return ("CL_INVALID_GLOBAL_WORK_SIZE");
230 : break;
231 : }
232 :
233 0 : return "unknown_error";
234 : }
235 :
236 0 : static const char *getCLDataTypeString(cl_channel_type dataType)
237 : {
238 0 : switch (dataType)
239 : {
240 0 : case CL_SNORM_INT8:
241 0 : return "CL_SNORM_INT8";
242 0 : case CL_SNORM_INT16:
243 0 : return "CL_SNORM_INT16";
244 0 : case CL_UNORM_INT8:
245 0 : return "CL_UNORM_INT8";
246 0 : case CL_UNORM_INT16:
247 0 : return "CL_UNORM_INT16";
248 : #if 0
249 : case CL_UNORM_SHORT_565: return "CL_UNORM_SHORT_565";
250 : case CL_UNORM_SHORT_555: return "CL_UNORM_SHORT_555";
251 : case CL_UNORM_INT_101010: return "CL_UNORM_INT_101010";
252 : case CL_SIGNED_INT8: return "CL_SIGNED_INT8";
253 : case CL_SIGNED_INT16: return "CL_SIGNED_INT16";
254 : case CL_SIGNED_INT32: return "CL_SIGNED_INT32";
255 : case CL_UNSIGNED_INT8: return "CL_UNSIGNED_INT8";
256 : case CL_UNSIGNED_INT16: return "CL_UNSIGNED_INT16";
257 : case CL_UNSIGNED_INT32: return "CL_UNSIGNED_INT32";
258 : case CL_HALF_FLOAT: return "CL_HALF_FLOAT";
259 : #endif
260 0 : case CL_FLOAT:
261 0 : return "CL_FLOAT";
262 0 : default:
263 0 : return "unknown";
264 : }
265 : }
266 :
267 : /*
268 : Finds an appropriate OpenCL device. For debugging, it's
269 : always easier to use CL_DEVICE_TYPE_CPU because then */
270 : /*ok*/ /*printf() can be called
271 : from the kernel. If debugging is on, we can print the name and stats about the
272 : device we're using.
273 : */
274 0 : static cl_device_id get_device(OCLVendor *peVendor)
275 : {
276 0 : cl_int err = 0;
277 0 : size_t returned_size = 0;
278 0 : cl_char vendor_name[1024] = {0};
279 0 : cl_char device_name[1024] = {0};
280 : cl_device_type device_type;
281 :
282 0 : std::vector<cl_platform_id> platforms;
283 0 : cl_uint num_platforms = 0;
284 :
285 : static bool gbBuggyOpenCL = false;
286 0 : if (gbBuggyOpenCL)
287 0 : return nullptr;
288 : try
289 : {
290 0 : err = clGetPlatformIDs(0, nullptr, &num_platforms);
291 0 : if (err != CL_SUCCESS || num_platforms == 0)
292 0 : return nullptr;
293 :
294 0 : platforms.resize(num_platforms);
295 0 : err = clGetPlatformIDs(num_platforms, &platforms[0], nullptr);
296 0 : if (err != CL_SUCCESS)
297 0 : return nullptr;
298 : }
299 0 : catch (...)
300 : {
301 0 : gbBuggyOpenCL = true;
302 0 : CPLDebug("OpenCL", "clGetPlatformIDs() threw a C++ exception");
303 : // This should normally not happen. But that does happen with
304 : // intel-opencl 0r2.0-54426 when run under xvfb-run
305 0 : return nullptr;
306 : }
307 :
308 : const bool bUseOpenCLCPU =
309 0 : CPLTestBool(CPLGetConfigOption("OPENCL_USE_CPU", "FALSE"));
310 :
311 : struct device_info
312 : {
313 : cl_device_id id;
314 : cl_device_type device_type;
315 : std::string vendor_name;
316 : std::string device_name;
317 : };
318 :
319 0 : std::vector<device_info> openCLdevices;
320 :
321 : // In case we have several implementations, pick up the non Intel one by
322 : // default, unless the PREFERRED_OPENCL_VENDOR config option is specified.
323 0 : if (num_platforms > 0)
324 : {
325 : const char *pszBlacklistedVendor =
326 0 : CPLGetConfigOption("BLACKLISTED_OPENCL_VENDOR", nullptr);
327 :
328 0 : for (cl_uint platformIndex = 0; platformIndex < num_platforms;
329 : platformIndex++)
330 : {
331 :
332 0 : constexpr int max_devices{20};
333 : cl_uint num_devices;
334 : cl_device_id devices[max_devices];
335 0 : err = clGetDeviceIDs(platforms[platformIndex],
336 : bUseOpenCLCPU
337 : ? CL_DEVICE_TYPE_CPU
338 : : CL_DEVICE_TYPE_CPU | CL_DEVICE_TYPE_GPU,
339 : max_devices, devices, &num_devices);
340 :
341 0 : if (err != CL_SUCCESS)
342 : {
343 0 : continue;
344 : }
345 :
346 0 : cl_device_id device = nullptr;
347 :
348 0 : for (cl_uint deviceIndex = 0; deviceIndex < num_devices;
349 : ++deviceIndex)
350 : {
351 :
352 0 : device = devices[deviceIndex];
353 0 : err = clGetDeviceInfo(device, CL_DEVICE_VENDOR,
354 : sizeof(vendor_name), vendor_name,
355 : &returned_size);
356 0 : err |=
357 0 : clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name),
358 : device_name, &returned_size);
359 0 : err |=
360 0 : clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(device_type),
361 : &device_type, &returned_size);
362 0 : assert(err == CL_SUCCESS);
363 0 : if (pszBlacklistedVendor &&
364 0 : EQUAL(reinterpret_cast<const char *>(vendor_name),
365 : pszBlacklistedVendor))
366 : {
367 0 : CPLDebug("OpenCL",
368 : "Blacklisted vendor='%s' / device='%s' "
369 : "implementation skipped",
370 : vendor_name, device_name);
371 : }
372 : else
373 : {
374 0 : CPLDebug(
375 : "OpenCL",
376 : "Found vendor='%s' / device='%s' (%s implementation).",
377 : vendor_name, device_name,
378 0 : (device_type == CL_DEVICE_TYPE_GPU) ? "GPU" : "CPU");
379 0 : openCLdevices.push_back(
380 : {device, device_type,
381 : reinterpret_cast<const char *>(vendor_name),
382 : reinterpret_cast<const char *>(device_name)});
383 : }
384 : }
385 : }
386 : }
387 :
388 0 : if (!openCLdevices.empty())
389 : {
390 : // Sort, GPU first, then preferred vendor, Intel comes last (unless is the preferred of course)
391 : const std::string preferredVendorName{
392 0 : CPLGetConfigOption("PREFERRED_OPENCL_VENDOR", "")};
393 0 : std::sort(
394 : openCLdevices.begin(), openCLdevices.end(),
395 0 : [&](const device_info first, const device_info second) -> int
396 : {
397 0 : return (first.device_type == CL_DEVICE_TYPE_GPU ? 4 : 0) +
398 0 : (first.vendor_name == preferredVendorName ? 2 : 0) +
399 0 : (first.vendor_name.find("Intel") == 0 ? 0 : 1) >
400 0 : (second.device_type == CL_DEVICE_TYPE_GPU ? 4 : 0) +
401 0 : (second.vendor_name == preferredVendorName ? 2 : 0) +
402 0 : (second.vendor_name.find("Intel") == 0 ? 0 : 1);
403 : });
404 : }
405 : else
406 : {
407 0 : CPLDebug("OpenCL", "No implementation found");
408 0 : return nullptr;
409 : }
410 :
411 0 : const device_info &device{openCLdevices.front()};
412 :
413 0 : CPLDebug("OpenCL",
414 : "Connected to vendor='%s' / device='%s' (%s implementation).",
415 : device.vendor_name.c_str(), device.device_name.c_str(),
416 0 : (device.device_type == CL_DEVICE_TYPE_GPU) ? "GPU" : "CPU");
417 :
418 0 : if (STARTS_WITH(device.vendor_name.c_str(), "Advanced Micro Devices"))
419 0 : *peVendor = VENDOR_AMD;
420 0 : else if (STARTS_WITH(device.vendor_name.c_str(), "Intel"))
421 0 : *peVendor = VENDOR_INTEL;
422 : else
423 0 : *peVendor = VENDOR_OTHER;
424 :
425 0 : return device.id;
426 : }
427 :
428 : /*
429 : Given that not all OpenCL devices support the same image formats, we need to
430 : make do with what we have. This leads to wasted space, but as OpenCL matures
431 : I hope it'll get better.
432 : */
433 0 : static cl_int set_supported_formats(struct oclWarper *warper,
434 : cl_channel_order minOrderSize,
435 : cl_channel_order *chosenOrder,
436 : unsigned int *chosenSize,
437 : cl_channel_type dataType)
438 : {
439 : cl_image_format *fmtBuf =
440 0 : static_cast<cl_image_format *>(calloc(256, sizeof(cl_image_format)));
441 : cl_uint numRet;
442 : cl_uint i;
443 0 : cl_uint extraSpace = 9999;
444 : cl_int err;
445 0 : int bFound = FALSE;
446 :
447 : // Find what we *can* handle
448 0 : handleErr(err = clGetSupportedImageFormats(
449 : warper->context, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE2D, 256,
450 : fmtBuf, &numRet));
451 0 : for (i = 0; i < numRet; ++i)
452 : {
453 0 : cl_channel_order thisOrderSize = 0;
454 0 : switch (fmtBuf[i].image_channel_order)
455 : {
456 : // Only support formats which use the channels in order
457 : // (x,y,z,w)
458 0 : case CL_R:
459 : case CL_INTENSITY:
460 : case CL_LUMINANCE:
461 0 : thisOrderSize = 1;
462 0 : break;
463 0 : case CL_RG:
464 0 : thisOrderSize = 2;
465 0 : break;
466 0 : case CL_RGB:
467 0 : thisOrderSize = 3;
468 0 : break;
469 0 : case CL_RGBA:
470 0 : thisOrderSize = 4;
471 0 : break;
472 : }
473 :
474 : // Choose an order with the least wasted space
475 0 : if (fmtBuf[i].image_channel_data_type == dataType &&
476 0 : minOrderSize <= thisOrderSize &&
477 0 : extraSpace > thisOrderSize - minOrderSize)
478 : {
479 :
480 : // Set the vector size, order, & remember wasted space
481 0 : (*chosenSize) = thisOrderSize;
482 0 : (*chosenOrder) = fmtBuf[i].image_channel_order;
483 0 : extraSpace = thisOrderSize - minOrderSize;
484 0 : bFound = TRUE;
485 : }
486 : }
487 :
488 0 : free(fmtBuf);
489 :
490 0 : if (!bFound)
491 : {
492 0 : CPLDebug("OpenCL",
493 : "Cannot find supported format for dataType = %s and "
494 : "minOrderSize = %d",
495 : getCLDataTypeString(dataType), static_cast<int>(minOrderSize));
496 : }
497 0 : return (bFound) ? CL_SUCCESS : CL_INVALID_OPERATION;
498 : }
499 :
500 : /*
501 : Allocate some pinned memory that we can use as an intermediate buffer. We're
502 : using the pinned memory to assemble the data before transferring it to the
503 : device. The reason we're using pinned RAM is because the transfer speed from
504 : host RAM to device RAM is faster than non-pinned. The disadvantage is that
505 : pinned RAM is a scarce OS resource. I'm making the assumption that the user
506 : has as much pinned host RAM available as total device RAM because device RAM
507 : tends to be similarly scarce. However, if the pinned memory fails we fall back
508 : to using a regular memory allocation.
509 :
510 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
511 : */
512 0 : static cl_int alloc_pinned_mem(struct oclWarper *warper, int imgNum,
513 : size_t dataSz, void **wrkPtr, cl_mem *wrkCL)
514 : {
515 0 : cl_int err = CL_SUCCESS;
516 0 : wrkCL[imgNum] = clCreateBuffer(warper->context,
517 : CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
518 : dataSz, nullptr, &err);
519 :
520 0 : if (err == CL_SUCCESS)
521 : {
522 0 : wrkPtr[imgNum] = clEnqueueMapBuffer(warper->queue, wrkCL[imgNum],
523 : CL_FALSE, CL_MAP_WRITE, 0, dataSz,
524 : 0, nullptr, nullptr, &err);
525 0 : handleErr(err);
526 : }
527 : else
528 : {
529 0 : wrkCL[imgNum] = nullptr;
530 : #ifdef DEBUG_OPENCL
531 : CPLDebug("OpenCL", "Using fallback non-pinned memory!");
532 : #endif
533 : // Fallback to regular allocation
534 0 : wrkPtr[imgNum] = VSI_MALLOC_VERBOSE(dataSz);
535 :
536 0 : if (wrkPtr[imgNum] == nullptr)
537 0 : handleErr(err = CL_OUT_OF_HOST_MEMORY);
538 : }
539 :
540 0 : return CL_SUCCESS;
541 : }
542 :
543 : /*
544 : Allocates the working host memory for all bands of the image in the warper
545 : structure. This includes both the source image buffers and the destination
546 : buffers. This memory is located on the host, so we can assemble the image.
547 : Reasons for buffering it like this include reading each row from disk and
548 : de-interleaving bands and parts of bands. Then they can be copied to the device
549 : as a single operation fit for use as an OpenCL memory object.
550 :
551 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
552 : */
553 0 : static cl_int alloc_working_arr(struct oclWarper *warper, size_t ptrSz,
554 : size_t dataSz, CPL_UNUSED size_t *fmtSz)
555 : {
556 0 : cl_int err = CL_SUCCESS;
557 : int i, b;
558 : size_t srcDataSz1, dstDataSz1, srcDataSz4, dstDataSz4;
559 0 : const int numBands = warper->numBands;
560 :
561 : // Find the best channel order for this format
562 0 : err = set_supported_formats(warper, 1, &(warper->imgChOrder1),
563 : &(warper->imgChSize1), warper->imageFormat);
564 0 : handleErr(err);
565 0 : if (warper->useVec)
566 : {
567 0 : err = set_supported_formats(warper, 4, &(warper->imgChOrder4),
568 : &(warper->imgChSize4), warper->imageFormat);
569 0 : handleErr(err);
570 : }
571 :
572 : // Alloc space for pointers to the main image data
573 0 : warper->realWork.v =
574 0 : static_cast<void **>(VSI_CALLOC_VERBOSE(ptrSz, warper->numImages));
575 0 : warper->dstRealWork.v =
576 0 : static_cast<void **>(VSI_CALLOC_VERBOSE(ptrSz, warper->numImages));
577 0 : if (warper->realWork.v == nullptr || warper->dstRealWork.v == nullptr)
578 0 : handleErr(err = CL_OUT_OF_HOST_MEMORY);
579 :
580 0 : if (warper->imagWorkCL != nullptr)
581 : {
582 : // Alloc space for pointers to the extra channel, if it exists
583 0 : warper->imagWork.v =
584 0 : static_cast<void **>(VSI_CALLOC_VERBOSE(ptrSz, warper->numImages));
585 0 : warper->dstImagWork.v =
586 0 : static_cast<void **>(VSI_CALLOC_VERBOSE(ptrSz, warper->numImages));
587 0 : if (warper->imagWork.v == nullptr || warper->dstImagWork.v == nullptr)
588 0 : handleErr(err = CL_OUT_OF_HOST_MEMORY);
589 : }
590 : else
591 : {
592 0 : warper->imagWork.v = nullptr;
593 0 : warper->dstImagWork.v = nullptr;
594 : }
595 :
596 : // Calc the sizes we need
597 0 : srcDataSz1 =
598 0 : dataSz * warper->srcWidth * warper->srcHeight * warper->imgChSize1;
599 0 : dstDataSz1 = dataSz * warper->dstWidth * warper->dstHeight;
600 0 : srcDataSz4 =
601 0 : dataSz * warper->srcWidth * warper->srcHeight * warper->imgChSize4;
602 0 : dstDataSz4 = dataSz * warper->dstWidth * warper->dstHeight * 4;
603 :
604 : // Allocate pinned memory for each band's image
605 0 : for (b = 0, i = 0; b < numBands && i < warper->numImages; ++i)
606 : {
607 0 : if (warper->useVec && b < numBands - numBands % 4)
608 : {
609 0 : handleErr(err = alloc_pinned_mem(warper, i, srcDataSz4,
610 : warper->realWork.v,
611 : warper->realWorkCL));
612 :
613 0 : handleErr(err = alloc_pinned_mem(warper, i, dstDataSz4,
614 : warper->dstRealWork.v,
615 : warper->dstRealWorkCL));
616 0 : b += 4;
617 : }
618 : else
619 : {
620 0 : handleErr(err = alloc_pinned_mem(warper, i, srcDataSz1,
621 : warper->realWork.v,
622 : warper->realWorkCL));
623 :
624 0 : handleErr(err = alloc_pinned_mem(warper, i, dstDataSz1,
625 : warper->dstRealWork.v,
626 : warper->dstRealWorkCL));
627 0 : ++b;
628 : }
629 : }
630 :
631 0 : if (warper->imagWorkCL != nullptr)
632 : {
633 : // Allocate pinned memory for each band's extra channel, if exists
634 0 : for (b = 0, i = 0; b < numBands && i < warper->numImages; ++i)
635 : {
636 0 : if (warper->useVec && b < numBands - numBands % 4)
637 : {
638 0 : handleErr(err = alloc_pinned_mem(warper, i, srcDataSz4,
639 : warper->imagWork.v,
640 : warper->imagWorkCL));
641 :
642 0 : handleErr(err = alloc_pinned_mem(warper, i, dstDataSz4,
643 : warper->dstImagWork.v,
644 : warper->dstImagWorkCL));
645 0 : b += 4;
646 : }
647 : else
648 : {
649 0 : handleErr(err = alloc_pinned_mem(warper, i, srcDataSz1,
650 : warper->imagWork.v,
651 : warper->imagWorkCL));
652 :
653 0 : handleErr(err = alloc_pinned_mem(warper, i, dstDataSz1,
654 : warper->dstImagWork.v,
655 : warper->dstImagWorkCL));
656 0 : ++b;
657 : }
658 : }
659 : }
660 :
661 0 : return CL_SUCCESS;
662 : }
663 :
664 : /*
665 : Assemble and create the kernel. For optimization, portability, and
666 : implementation limitation reasons, the program is actually assembled from
667 : several strings, then compiled with as many invariants as possible defined by
668 : the preprocessor. There is also quite a bit of error-catching code in here
669 : because the kernel is where many bugs show up.
670 :
671 : Returns CL_SUCCESS on success and other CL_* errors in the error buffer when
672 : something goes wrong.
673 : */
674 0 : static cl_kernel get_kernel(struct oclWarper *warper, char useVec,
675 : double dfXScale, double dfYScale, double dfXFilter,
676 : double dfYFilter, int nXRadius, int nYRadius,
677 : int nFiltInitX, int nFiltInitY, cl_int *clErr)
678 : {
679 : cl_program program;
680 : cl_kernel kernel;
681 0 : cl_int err = CL_SUCCESS;
682 0 : constexpr int PROGBUF_SIZE = 128000;
683 0 : std::string buffer;
684 0 : buffer.resize(PROGBUF_SIZE);
685 0 : std::string progBuf;
686 0 : progBuf.resize(PROGBUF_SIZE);
687 0 : float dstMinVal = 0.f, dstMaxVal = 0.0;
688 :
689 0 : const char *outType = "";
690 0 : const char *dUseVec = "";
691 0 : const char *dVecf = "float";
692 0 : const char *kernGenFuncs = R""""(
693 : // ********************* General Funcs ********************
694 : void clampToDst(float fReal,
695 : __global outType *dstPtr,
696 : unsigned int iDstOffset,
697 : __constant float *fDstNoDataReal,
698 : int bandNum);
699 : void setPixel(__global outType *dstReal,
700 : __global outType *dstImag,
701 : __global float *dstDensity,
702 : __global int *nDstValid,
703 : __constant float *fDstNoDataReal,
704 : const int bandNum,
705 : vecf fDensity, vecf fReal, vecf fImag);
706 : int getPixel(__read_only image2d_t srcReal,
707 : __read_only image2d_t srcImag,
708 : __global float *fUnifiedSrcDensity,
709 : __global int *nUnifiedSrcValid,
710 : __constant char *useBandSrcValid,
711 : __global int *nBandSrcValid,
712 : const int2 iSrc,
713 : int bandNum,
714 : vecf *fDensity, vecf *fReal, vecf *fImag);
715 : int isValid(__global float *fUnifiedSrcDensity,
716 : __global int *nUnifiedSrcValid,
717 : float2 fSrcCoords );
718 : float2 getSrcCoords(__read_only image2d_t srcCoords);
719 :
720 : #ifdef USE_CLAMP_TO_DST_FLOAT
721 : void clampToDst(float fReal,
722 : __global outType *dstPtr,
723 : unsigned int iDstOffset,
724 : __constant float *fDstNoDataReal,
725 : int bandNum)
726 : {
727 : dstPtr[iDstOffset] = fReal;
728 : }
729 : #else
730 : void clampToDst(float fReal,
731 : __global outType *dstPtr,
732 : unsigned int iDstOffset,
733 : __constant float *fDstNoDataReal,
734 : int bandNum)
735 : {
736 : fReal *= dstMaxVal;
737 :
738 : if (fReal < dstMinVal)
739 : dstPtr[iDstOffset] = (outType)dstMinVal;
740 : else if (fReal > dstMaxVal)
741 : dstPtr[iDstOffset] = (outType)dstMaxVal;
742 : else
743 : dstPtr[iDstOffset] = (dstMinVal < 0) ? (outType)floor(fReal + 0.5f) : (outType)(fReal + 0.5f);
744 :
745 : if (useDstNoDataReal && bandNum >= 0 &&
746 : fDstNoDataReal[bandNum] == dstPtr[iDstOffset])
747 : {
748 : if (dstPtr[iDstOffset] == dstMinVal)
749 : dstPtr[iDstOffset] = dstMinVal + 1;
750 : else
751 : dstPtr[iDstOffset] = dstPtr[iDstOffset] - 1;
752 : }
753 : }
754 : #endif
755 :
756 : void setPixel(__global outType *dstReal,
757 : __global outType *dstImag,
758 : __global float *dstDensity,
759 : __global int *nDstValid,
760 : __constant float *fDstNoDataReal,
761 : const int bandNum,
762 : vecf fDensity, vecf fReal, vecf fImag)
763 : {
764 : unsigned int iDstOffset = get_global_id(1)*iDstWidth + get_global_id(0);
765 :
766 : #ifdef USE_VEC
767 : if (fDensity.x < 0.00001f && fDensity.y < 0.00001f &&
768 : fDensity.z < 0.00001f && fDensity.w < 0.00001f ) {
769 :
770 : fReal = 0.0f;
771 : fImag = 0.0f;
772 :
773 : } else if (fDensity.x < 0.9999f || fDensity.y < 0.9999f ||
774 : fDensity.z < 0.9999f || fDensity.w < 0.9999f ) {
775 : vecf fDstReal, fDstImag;
776 : float fDstDensity;
777 :
778 : fDstReal.x = dstReal[iDstOffset];
779 : fDstReal.y = dstReal[iDstOffset+iDstHeight*iDstWidth];
780 : fDstReal.z = dstReal[iDstOffset+iDstHeight*iDstWidth*2];
781 : fDstReal.w = dstReal[iDstOffset+iDstHeight*iDstWidth*3];
782 : if (useImag) {
783 : fDstImag.x = dstImag[iDstOffset];
784 : fDstImag.y = dstImag[iDstOffset+iDstHeight*iDstWidth];
785 : fDstImag.z = dstImag[iDstOffset+iDstHeight*iDstWidth*2];
786 : fDstImag.w = dstImag[iDstOffset+iDstHeight*iDstWidth*3];
787 : }
788 : #else
789 : if (fDensity < 0.00001f) {
790 :
791 : fReal = 0.0f;
792 : fImag = 0.0f;
793 :
794 : } else if (fDensity < 0.9999f) {
795 : vecf fDstReal, fDstImag;
796 : float fDstDensity;
797 :
798 : fDstReal = dstReal[iDstOffset];
799 : if (useImag)
800 : fDstImag = dstImag[iDstOffset];
801 : #endif
802 :
803 : if (useDstDensity)
804 : fDstDensity = dstDensity[iDstOffset];
805 : else if (useDstValid &&
806 : !((nDstValid[iDstOffset>>5] & (0x01 << (iDstOffset & 0x1f))) ))
807 : fDstDensity = 0.0f;
808 : else
809 : fDstDensity = 1.0f;
810 :
811 : vecf fDstInfluence = (1.0f - fDensity) * fDstDensity;
812 :
813 : // Density should be checked for <= 0.0 & handled by the calling function
814 : fReal = (fReal * fDensity + fDstReal * fDstInfluence) / (fDensity + fDstInfluence);
815 : if (useImag)
816 : fImag = (fImag * fDensity + fDstImag * fDstInfluence) / (fDensity + fDstInfluence);
817 : }
818 :
819 : #ifdef USE_VEC
820 : clampToDst(fReal.x, dstReal, iDstOffset, fDstNoDataReal, bandNum);
821 : clampToDst(fReal.y, dstReal, iDstOffset+iDstHeight*iDstWidth, fDstNoDataReal, bandNum);
822 : clampToDst(fReal.z, dstReal, iDstOffset+iDstHeight*iDstWidth*2, fDstNoDataReal, bandNum);
823 : clampToDst(fReal.w, dstReal, iDstOffset+iDstHeight*iDstWidth*3, fDstNoDataReal, bandNum);
824 : if (useImag) {
825 : clampToDst(fImag.x, dstImag, iDstOffset, fDstNoDataReal, -1);
826 : clampToDst(fImag.y, dstImag, iDstOffset+iDstHeight*iDstWidth, fDstNoDataReal, -1);
827 : clampToDst(fImag.z, dstImag, iDstOffset+iDstHeight*iDstWidth*2, fDstNoDataReal, -1);
828 : clampToDst(fImag.w, dstImag, iDstOffset+iDstHeight*iDstWidth*3, fDstNoDataReal, -1);
829 : }
830 : #else
831 : clampToDst(fReal, dstReal, iDstOffset, fDstNoDataReal, bandNum);
832 : if (useImag)
833 : clampToDst(fImag, dstImag, iDstOffset, fDstNoDataReal, -1);
834 : #endif
835 : }
836 :
837 : int getPixel(__read_only image2d_t srcReal,
838 : __read_only image2d_t srcImag,
839 : __global float *fUnifiedSrcDensity,
840 : __global int *nUnifiedSrcValid,
841 : __constant char *useBandSrcValid,
842 : __global int *nBandSrcValid,
843 : const int2 iSrc,
844 : int bandNum,
845 : vecf *fDensity, vecf *fReal, vecf *fImag)
846 : {
847 : int iSrcOffset = 0, iBandValidLen = 0, iSrcOffsetMask = 0;
848 : int bHasValid = FALSE;
849 :
850 : // Clamp the src offset values if needed
851 : if(useUnifiedSrcDensity | useUnifiedSrcValid | useUseBandSrcValid){
852 : int iSrcX = iSrc.x;
853 : int iSrcY = iSrc.y;
854 :
855 : // Needed because the offset isn't clamped in OpenCL hardware
856 : if(iSrcX < 0)
857 : iSrcX = 0;
858 : else if(iSrcX >= iSrcWidth)
859 : iSrcX = iSrcWidth - 1;
860 :
861 : if(iSrcY < 0)
862 : iSrcY = 0;
863 : else if(iSrcY >= iSrcHeight)
864 : iSrcY = iSrcHeight - 1;
865 :
866 : iSrcOffset = iSrcY*iSrcWidth + iSrcX;
867 : iBandValidLen = 1 + ((iSrcWidth*iSrcHeight)>>5);
868 : iSrcOffsetMask = (0x01 << (iSrcOffset & 0x1f));
869 : }
870 :
871 : if (useUnifiedSrcValid &&
872 : !((nUnifiedSrcValid[iSrcOffset>>5] & iSrcOffsetMask) ) )
873 : return FALSE;
874 :
875 : #ifdef USE_VEC
876 : if (!useUseBandSrcValid || !useBandSrcValid[bandNum] ||
877 : ((nBandSrcValid[(iSrcOffset>>5)+iBandValidLen*bandNum ] & iSrcOffsetMask)) )
878 : bHasValid = TRUE;
879 :
880 : if (!useUseBandSrcValid || !useBandSrcValid[bandNum+1] ||
881 : ((nBandSrcValid[(iSrcOffset>>5)+iBandValidLen*(1+bandNum)] & iSrcOffsetMask)) )
882 : bHasValid = TRUE;
883 :
884 : if (!useUseBandSrcValid || !useBandSrcValid[bandNum+2] ||
885 : ((nBandSrcValid[(iSrcOffset>>5)+iBandValidLen*(2+bandNum)] & iSrcOffsetMask)) )
886 : bHasValid = TRUE;
887 :
888 : if (!useUseBandSrcValid || !useBandSrcValid[bandNum+3] ||
889 : ((nBandSrcValid[(iSrcOffset>>5)+iBandValidLen*(3+bandNum)] & iSrcOffsetMask)) )
890 : bHasValid = TRUE;
891 : #else
892 : if (!useUseBandSrcValid || !useBandSrcValid[bandNum] ||
893 : ((nBandSrcValid[(iSrcOffset>>5)+iBandValidLen*bandNum ] & iSrcOffsetMask)) )
894 : bHasValid = TRUE;
895 : #endif
896 :
897 : if (!bHasValid)
898 : return FALSE;
899 :
900 : const sampler_t samp = CLK_NORMALIZED_COORDS_FALSE |
901 : CLK_ADDRESS_CLAMP_TO_EDGE |
902 : CLK_FILTER_NEAREST;
903 :
904 : #ifdef USE_VEC
905 : (*fReal) = read_imagef(srcReal, samp, iSrc);
906 : if (useImag)
907 : (*fImag) = read_imagef(srcImag, samp, iSrc);
908 : #else
909 : (*fReal) = read_imagef(srcReal, samp, iSrc).x;
910 : if (useImag)
911 : (*fImag) = read_imagef(srcImag, samp, iSrc).x;
912 : #endif
913 :
914 : if (useUnifiedSrcDensity) {
915 : (*fDensity) = fUnifiedSrcDensity[iSrcOffset];
916 : } else {
917 : (*fDensity) = 1.0f;
918 : return TRUE;
919 : }
920 :
921 : #ifdef USE_VEC
922 : return (*fDensity).x > 0.0000001f || (*fDensity).y > 0.0000001f ||
923 : (*fDensity).z > 0.0000001f || (*fDensity).w > 0.0000001f;
924 : #else
925 : return (*fDensity) > 0.0000001f;
926 : #endif
927 : }
928 :
929 : int isValid(__global float *fUnifiedSrcDensity,
930 : __global int *nUnifiedSrcValid,
931 : float2 fSrcCoords )
932 : {
933 : if (fSrcCoords.x < 0.0f || fSrcCoords.y < 0.0f)
934 : return FALSE;
935 :
936 : int iSrcX = (int) (fSrcCoords.x - 0.5f);
937 : int iSrcY = (int) (fSrcCoords.y - 0.5f);
938 :
939 : if( iSrcX < 0 || iSrcX >= iSrcWidth || iSrcY < 0 || iSrcY >= iSrcHeight )
940 : return FALSE;
941 :
942 : int iSrcOffset = iSrcX + iSrcY * iSrcWidth;
943 :
944 : if (useUnifiedSrcDensity && fUnifiedSrcDensity[iSrcOffset] < 0.00001f)
945 : return FALSE;
946 :
947 : if (useUnifiedSrcValid &&
948 : !(nUnifiedSrcValid[iSrcOffset>>5] & (0x01 << (iSrcOffset & 0x1f))) )
949 : return FALSE;
950 :
951 : return TRUE;
952 : }
953 :
954 : float2 getSrcCoords(__read_only image2d_t srcCoords)
955 : {
956 : // Find an appropriate place to sample the coordinates so we're still
957 : // accurate after linear interpolation.
958 : int nDstX = get_global_id(0);
959 : int nDstY = get_global_id(1);
960 : float2 fDst = (float2)((0.5f * (float)iCoordMult + nDstX) /
961 : (float)((ceil((iDstWidth - 1) / (float)iCoordMult) + 1) * iCoordMult),
962 : (0.5f * (float)iCoordMult + nDstY) /
963 : (float)((ceil((iDstHeight - 1) / (float)iCoordMult) + 1) * iCoordMult));
964 :
965 : // Check & return when the thread group overruns the image size
966 : if (nDstX >= iDstWidth || nDstY >= iDstHeight)
967 : return (float2)(-99.0f, -99.0f);
968 :
969 : const sampler_t samp = CLK_NORMALIZED_COORDS_TRUE |
970 : CLK_ADDRESS_CLAMP_TO_EDGE |
971 : CLK_FILTER_LINEAR;
972 :
973 : float4 fSrcCoords = read_imagef(srcCoords,samp,fDst);
974 :
975 : return (float2)(fSrcCoords.x, fSrcCoords.y);
976 : }
977 : )"""";
978 :
979 0 : const char *kernBilinear = R""""(
980 : // ************************ Bilinear ************************
981 : __kernel void resamp(__read_only image2d_t srcCoords,
982 : __read_only image2d_t srcReal,
983 : __read_only image2d_t srcImag,
984 : __global float *fUnifiedSrcDensity,
985 : __global int *nUnifiedSrcValid,
986 : __constant char *useBandSrcValid,
987 : __global int *nBandSrcValid,
988 : __global outType *dstReal,
989 : __global outType *dstImag,
990 : __constant float *fDstNoDataReal,
991 : __global float *dstDensity,
992 : __global int *nDstValid,
993 : const int bandNum)
994 : {
995 : float2 fSrc = getSrcCoords(srcCoords);
996 : if (!isValid(fUnifiedSrcDensity, nUnifiedSrcValid, fSrc))
997 : return;
998 :
999 : int iSrcX = (int) floor(fSrc.x - 0.5f);
1000 : int iSrcY = (int) floor(fSrc.y - 0.5f);
1001 : float fRatioX = 1.5f - (fSrc.x - iSrcX);
1002 : float fRatioY = 1.5f - (fSrc.y - iSrcY);
1003 : vecf fReal, fImag, fDens;
1004 : vecf fAccumulatorReal = 0.0f, fAccumulatorImag = 0.0f;
1005 : vecf fAccumulatorDensity = 0.0f;
1006 : float fAccumulatorDivisor = 0.0f;
1007 :
1008 : if ( iSrcY >= 0 && iSrcY < iSrcHeight ) {
1009 : float fMult1 = fRatioX * fRatioY;
1010 : float fMult2 = (1.0f-fRatioX) * fRatioY;
1011 :
1012 : // Upper Left Pixel
1013 : if ( iSrcX >= 0 && iSrcX < iSrcWidth
1014 : && getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1015 : useBandSrcValid, nBandSrcValid, (int2)(iSrcX, iSrcY),
1016 : bandNum, &fDens, &fReal, &fImag))
1017 : {
1018 : fAccumulatorDivisor += fMult1;
1019 : fAccumulatorReal += fReal * fMult1;
1020 : fAccumulatorImag += fImag * fMult1;
1021 : fAccumulatorDensity += fDens * fMult1;
1022 : }
1023 :
1024 : // Upper Right Pixel
1025 : if ( iSrcX+1 >= 0 && iSrcX+1 < iSrcWidth
1026 : && getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1027 : useBandSrcValid, nBandSrcValid, (int2)(iSrcX+1, iSrcY),
1028 : bandNum, &fDens, &fReal, &fImag))
1029 : {
1030 : fAccumulatorDivisor += fMult2;
1031 : fAccumulatorReal += fReal * fMult2;
1032 : fAccumulatorImag += fImag * fMult2;
1033 : fAccumulatorDensity += fDens * fMult2;
1034 : }
1035 : }
1036 :
1037 : if ( iSrcY+1 >= 0 && iSrcY+1 < iSrcHeight ) {
1038 : float fMult1 = fRatioX * (1.0f-fRatioY);
1039 : float fMult2 = (1.0f-fRatioX) * (1.0f-fRatioY);
1040 :
1041 : // Lower Left Pixel
1042 : if ( iSrcX >= 0 && iSrcX < iSrcWidth
1043 : && getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1044 : useBandSrcValid, nBandSrcValid, (int2)(iSrcX, iSrcY+1),
1045 : bandNum, &fDens, &fReal, &fImag))
1046 : {
1047 : fAccumulatorDivisor += fMult1;
1048 : fAccumulatorReal += fReal * fMult1;
1049 : fAccumulatorImag += fImag * fMult1;
1050 : fAccumulatorDensity += fDens * fMult1;
1051 : }
1052 :
1053 : // Lower Right Pixel
1054 : if ( iSrcX+1 >= 0 && iSrcX+1 < iSrcWidth
1055 : && getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1056 : useBandSrcValid, nBandSrcValid, (int2)(iSrcX+1, iSrcY+1),
1057 : bandNum, &fDens, &fReal, &fImag))
1058 : {
1059 : fAccumulatorDivisor += fMult2;
1060 : fAccumulatorReal += fReal * fMult2;
1061 : fAccumulatorImag += fImag * fMult2;
1062 : fAccumulatorDensity += fDens * fMult2;
1063 : }
1064 : }
1065 :
1066 : // Compute and save final pixel
1067 : if ( fAccumulatorDivisor < 0.00001f ) {
1068 : setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
1069 : 0.0f, 0.0f, 0.0f );
1070 : } else if ( fAccumulatorDivisor < 0.99999f || fAccumulatorDivisor > 1.00001f ) {
1071 : setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
1072 : fAccumulatorDensity / fAccumulatorDivisor,
1073 : fAccumulatorReal / fAccumulatorDivisor,
1074 : #if useImag != 0
1075 : fAccumulatorImag / fAccumulatorDivisor );
1076 : #else
1077 : 0.0f );
1078 : #endif
1079 : } else {
1080 : setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
1081 : fAccumulatorDensity, fAccumulatorReal, fAccumulatorImag );
1082 : }
1083 : }
1084 : )"""";
1085 :
1086 0 : const char *kernCubic = R""""(
1087 : // ************************ Cubic ************************
1088 : vecf cubicConvolution(float dist1, float dist2, float dist3,
1089 : vecf f0, vecf f1, vecf f2, vecf f3);
1090 :
1091 : vecf cubicConvolution(float dist1, float dist2, float dist3,
1092 : vecf f0, vecf f1, vecf f2, vecf f3)
1093 : {
1094 : return ( f1
1095 : + 0.5f * (dist1*(f2 - f0)
1096 : + dist2*(2.0f*f0 - 5.0f*f1 + 4.0f*f2 - f3)
1097 : + dist3*(3.0f*(f1 - f2) + f3 - f0)));
1098 : }
1099 :
1100 : // ************************ Cubic ************************
1101 : __kernel void resamp(__read_only image2d_t srcCoords,
1102 : __read_only image2d_t srcReal,
1103 : __read_only image2d_t srcImag,
1104 : __global float *fUnifiedSrcDensity,
1105 : __global int *nUnifiedSrcValid,
1106 : __constant char *useBandSrcValid,
1107 : __global int *nBandSrcValid,
1108 : __global outType *dstReal,
1109 : __global outType *dstImag,
1110 : __constant float *fDstNoDataReal,
1111 : __global float *dstDensity,
1112 : __global int *nDstValid,
1113 : const int bandNum)
1114 : {
1115 : int i;
1116 : float2 fSrc = getSrcCoords(srcCoords);
1117 :
1118 : if (!isValid(fUnifiedSrcDensity, nUnifiedSrcValid, fSrc))
1119 : return;
1120 :
1121 : int iSrcX = (int) floor( fSrc.x - 0.5f );
1122 : int iSrcY = (int) floor( fSrc.y - 0.5f );
1123 : float fDeltaX = fSrc.x - 0.5f - (float)iSrcX;
1124 : float fDeltaY = fSrc.y - 0.5f - (float)iSrcY;
1125 : float fDeltaX2 = fDeltaX * fDeltaX;
1126 : float fDeltaY2 = fDeltaY * fDeltaY;
1127 : float fDeltaX3 = fDeltaX2 * fDeltaX;
1128 : float fDeltaY3 = fDeltaY2 * fDeltaY;
1129 : vecf afReal[4], afDens[4];
1130 : #if useImag != 0
1131 : vecf afImag[4];
1132 : #else
1133 : vecf fImag = 0.0f;
1134 : #endif
1135 :
1136 : // Loop over rows
1137 : for (i = -1; i < 3; ++i)
1138 : {
1139 : vecf fReal1 = 0.0f, fReal2 = 0.0f, fReal3 = 0.0f, fReal4 = 0.0f;
1140 : vecf fDens1 = 0.0f, fDens2 = 0.0f, fDens3 = 0.0f, fDens4 = 0.0f;
1141 : int hasPx;
1142 : #if useImag != 0
1143 : vecf fImag1 = 0.0f, fImag2 = 0.0f, fImag3 = 0.0f, fImag4 = 0.0f;
1144 :
1145 : //Get all the pixels for this row
1146 : hasPx = getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1147 : useBandSrcValid, nBandSrcValid, (int2)(iSrcX-1, iSrcY+i),
1148 : bandNum, &fDens1, &fReal1, &fImag1);
1149 :
1150 : hasPx |= getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1151 : useBandSrcValid, nBandSrcValid, (int2)(iSrcX , iSrcY+i),
1152 : bandNum, &fDens2, &fReal2, &fImag2);
1153 :
1154 : hasPx |= getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1155 : useBandSrcValid, nBandSrcValid, (int2)(iSrcX+1, iSrcY+i),
1156 : bandNum, &fDens3, &fReal3, &fImag3);
1157 :
1158 : hasPx |= getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1159 : useBandSrcValid, nBandSrcValid, (int2)(iSrcX+2, iSrcY+i),
1160 : bandNum, &fDens4, &fReal4, &fImag4);
1161 : #else
1162 : //Get all the pixels for this row
1163 : hasPx = getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1164 : useBandSrcValid, nBandSrcValid, (int2)(iSrcX-1, iSrcY+i),
1165 : bandNum, &fDens1, &fReal1, &fImag);
1166 :
1167 : hasPx |= getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1168 : useBandSrcValid, nBandSrcValid, (int2)(iSrcX , iSrcY+i),
1169 : bandNum, &fDens2, &fReal2, &fImag);
1170 :
1171 : hasPx |= getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1172 : useBandSrcValid, nBandSrcValid, (int2)(iSrcX+1, iSrcY+i),
1173 : bandNum, &fDens3, &fReal3, &fImag);
1174 :
1175 : hasPx |= getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1176 : useBandSrcValid, nBandSrcValid, (int2)(iSrcX+2, iSrcY+i),
1177 : bandNum, &fDens4, &fReal4, &fImag);
1178 : #endif
1179 :
1180 : // Shortcut if no px
1181 : if (!hasPx) {
1182 : afDens[i+1] = 0.0f;
1183 : afReal[i+1] = 0.0f;
1184 : #if useImag != 0
1185 : afImag[i+1] = 0.0f;
1186 : #endif
1187 : continue;
1188 : }
1189 :
1190 : // Process this row
1191 : afDens[i+1] = cubicConvolution(fDeltaX, fDeltaX2, fDeltaX3, fDens1, fDens2, fDens3, fDens4);
1192 : afReal[i+1] = cubicConvolution(fDeltaX, fDeltaX2, fDeltaX3, fReal1, fReal2, fReal3, fReal4);
1193 : #if useImag != 0
1194 : afImag[i+1] = cubicConvolution(fDeltaX, fDeltaX2, fDeltaX3, fImag1, fImag2, fImag3, fImag4);
1195 : #endif
1196 : }
1197 :
1198 : // Compute and save final pixel
1199 : setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
1200 : cubicConvolution(fDeltaY, fDeltaY2, fDeltaY3, afDens[0], afDens[1], afDens[2], afDens[3]),
1201 : cubicConvolution(fDeltaY, fDeltaY2, fDeltaY3, afReal[0], afReal[1], afReal[2], afReal[3]),
1202 : #if useImag != 0
1203 : cubicConvolution(fDeltaY, fDeltaY2, fDeltaY3, afImag[0], afImag[1], afImag[2], afImag[3]) );
1204 : #else
1205 : fImag );
1206 : #endif
1207 : }
1208 : )"""";
1209 :
1210 0 : const char *kernResampler = R""""(
1211 : // ************************ LanczosSinc ************************
1212 :
1213 : float lanczosSinc( float fX, float fR );
1214 : float bSpline( float x );
1215 :
1216 : float lanczosSinc( float fX, float fR )
1217 : {
1218 : if ( fX > fR || fX < -fR)
1219 : return 0.0f;
1220 : if ( fX == 0.0f )
1221 : return 1.0f;
1222 :
1223 : float fPIX = PI * fX;
1224 : return ( sin(fPIX) / fPIX ) * ( sin(fPIX / fR) * fR / fPIX );
1225 : }
1226 :
1227 : // ************************ Bicubic Spline ************************
1228 :
1229 : float bSpline( float x )
1230 : {
1231 : float xp2 = x + 2.0f;
1232 : float xp1 = x + 1.0f;
1233 : float xm1 = x - 1.0f;
1234 : float xp2c = xp2 * xp2 * xp2;
1235 :
1236 : return (((xp2 > 0.0f)?((xp1 > 0.0f)?((x > 0.0f)?((xm1 > 0.0f)?
1237 : -4.0f * xm1*xm1*xm1:0.0f) +
1238 : 6.0f * x*x*x:0.0f) +
1239 : -4.0f * xp1*xp1*xp1:0.0f) +
1240 : xp2c:0.0f) ) * 0.166666666666666666666f;
1241 : }
1242 :
1243 : // ************************ General Resampler ************************
1244 :
1245 : __kernel void resamp(__read_only image2d_t srcCoords,
1246 : __read_only image2d_t srcReal,
1247 : __read_only image2d_t srcImag,
1248 : __global float *fUnifiedSrcDensity,
1249 : __global int *nUnifiedSrcValid,
1250 : __constant char *useBandSrcValid,
1251 : __global int *nBandSrcValid,
1252 : __global outType *dstReal,
1253 : __global outType *dstImag,
1254 : __constant float *fDstNoDataReal,
1255 : __global float *dstDensity,
1256 : __global int *nDstValid,
1257 : const int bandNum)
1258 : {
1259 : float2 fSrc = getSrcCoords(srcCoords);
1260 :
1261 : if (!isValid(fUnifiedSrcDensity, nUnifiedSrcValid, fSrc))
1262 : return;
1263 :
1264 : int iSrcX = floor( fSrc.x - 0.5f );
1265 : int iSrcY = floor( fSrc.y - 0.5f );
1266 : float fDeltaX = fSrc.x - 0.5f - (float)iSrcX;
1267 : float fDeltaY = fSrc.y - 0.5f - (float)iSrcY;
1268 :
1269 : vecf fAccumulatorReal = 0.0f, fAccumulatorImag = 0.0f;
1270 : vecf fAccumulatorDensity = 0.0f;
1271 : float fAccumulatorWeight = 0.0f;
1272 : int i, j;
1273 :
1274 : // Loop over pixel rows in the kernel
1275 : for ( j = nFiltInitY; j <= nYRadius; ++j )
1276 : {
1277 : float fWeight1;
1278 : int2 iSrc = (int2)(0, iSrcY + j);
1279 :
1280 : // Skip sampling over edge of image
1281 : if ( iSrc.y < 0 || iSrc.y >= iSrcHeight )
1282 : continue;
1283 :
1284 : // Select the resampling algorithm
1285 : if ( doCubicSpline )
1286 : // Calculate the Y weight
1287 : fWeight1 = ( fYScale < 1.0f ) ?
1288 : bSpline(((float)j) * fYScale) * fYScale :
1289 : bSpline(((float)j) - fDeltaY);
1290 : else
1291 : fWeight1 = ( fYScale < 1.0f ) ?
1292 : lanczosSinc(j * fYScale, fYFilter) * fYScale :
1293 : lanczosSinc(j - fDeltaY, fYFilter);
1294 :
1295 : // Iterate over pixels in row
1296 : for ( i = nFiltInitX; i <= nXRadius; ++i )
1297 : {
1298 : float fWeight2;
1299 : vecf fDensity = 0.0f, fReal, fImag;
1300 : iSrc.x = iSrcX + i;
1301 :
1302 : // Skip sampling at edge of image
1303 : // Skip sampling when invalid pixel
1304 : if ( iSrc.x < 0 || iSrc.x >= iSrcWidth ||
1305 : !getPixel(srcReal, srcImag, fUnifiedSrcDensity,
1306 : nUnifiedSrcValid, useBandSrcValid, nBandSrcValid,
1307 : iSrc, bandNum, &fDensity, &fReal, &fImag) )
1308 : continue;
1309 :
1310 : // Choose among possible algorithms
1311 : if ( doCubicSpline )
1312 : // Calculate & save the X weight
1313 : fWeight2 = fWeight1 * ((fXScale < 1.0f ) ?
1314 : bSpline((float)i * fXScale) * fXScale :
1315 : bSpline(fDeltaX - (float)i));
1316 : else
1317 : // Calculate & save the X weight
1318 : fWeight2 = fWeight1 * ((fXScale < 1.0f ) ?
1319 : lanczosSinc(i * fXScale, fXFilter) * fXScale :
1320 : lanczosSinc(i - fDeltaX, fXFilter));
1321 :
1322 : // Accumulate!
1323 : fAccumulatorReal += fReal * fWeight2;
1324 : fAccumulatorImag += fImag * fWeight2;
1325 : fAccumulatorDensity += fDensity * fWeight2;
1326 : fAccumulatorWeight += fWeight2;
1327 : }
1328 : }
1329 :
1330 : if ( fAccumulatorWeight < 0.000001f ) {
1331 : setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
1332 : 0.0f, 0.0f, 0.0f);
1333 : } else if ( fAccumulatorWeight < 0.99999f || fAccumulatorWeight > 1.00001f ) {
1334 : // Calculate the output taking into account weighting
1335 : setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
1336 : fAccumulatorDensity / fAccumulatorWeight,
1337 : fAccumulatorReal / fAccumulatorWeight,
1338 : #if useImag != 0
1339 : fAccumulatorImag / fAccumulatorWeight );
1340 : #else
1341 : 0.0f );
1342 : #endif
1343 : } else {
1344 : setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
1345 : fAccumulatorDensity, fAccumulatorReal, fAccumulatorImag);
1346 : }
1347 : }
1348 : )"""";
1349 :
1350 : // Defines based on image format
1351 0 : switch (warper->imageFormat)
1352 : {
1353 0 : case CL_FLOAT:
1354 0 : dstMinVal = std::numeric_limits<float>::lowest();
1355 0 : dstMaxVal = std::numeric_limits<float>::max();
1356 0 : outType = "float";
1357 0 : break;
1358 0 : case CL_SNORM_INT8:
1359 0 : dstMinVal = -128.0;
1360 0 : dstMaxVal = 127.0;
1361 0 : outType = "char";
1362 0 : break;
1363 0 : case CL_UNORM_INT8:
1364 0 : dstMinVal = 0.0;
1365 0 : dstMaxVal = 255.0;
1366 0 : outType = "uchar";
1367 0 : break;
1368 0 : case CL_SNORM_INT16:
1369 0 : dstMinVal = -32768.0;
1370 0 : dstMaxVal = 32767.0;
1371 0 : outType = "short";
1372 0 : break;
1373 0 : case CL_UNORM_INT16:
1374 0 : dstMinVal = 0.0;
1375 0 : dstMaxVal = 65535.0;
1376 0 : outType = "ushort";
1377 0 : break;
1378 0 : default:
1379 0 : CPLError(CE_Failure, CPLE_AppDefined, "Unhandled imageFormat = %d",
1380 : warper->imageFormat);
1381 0 : return nullptr;
1382 : }
1383 :
1384 : // Use vector format?
1385 0 : if (useVec)
1386 : {
1387 0 : dUseVec = "-D USE_VEC";
1388 0 : dVecf = "float4";
1389 : }
1390 :
1391 : // Assemble the kernel from parts. The compiler is unable to handle multiple
1392 : // kernels in one string with more than a few __constant modifiers each.
1393 0 : if (warper->resampAlg == OCL_Bilinear)
1394 0 : snprintf(&progBuf[0], PROGBUF_SIZE, "%s\n%s", kernGenFuncs,
1395 : kernBilinear);
1396 0 : else if (warper->resampAlg == OCL_Cubic)
1397 0 : snprintf(&progBuf[0], PROGBUF_SIZE, "%s\n%s", kernGenFuncs, kernCubic);
1398 : else
1399 0 : snprintf(&progBuf[0], PROGBUF_SIZE, "%s\n%s", kernGenFuncs,
1400 : kernResampler);
1401 :
1402 : // Actually make the program from assembled source
1403 0 : const char *pszProgBuf = progBuf.c_str();
1404 0 : program = clCreateProgramWithSource(warper->context, 1, &pszProgBuf,
1405 : nullptr, &err);
1406 0 : handleErrGoto(err, error_final);
1407 :
1408 : // Assemble the compiler arg string for speed. All invariants should be
1409 : // defined here.
1410 0 : snprintf(
1411 0 : &buffer[0], PROGBUF_SIZE,
1412 : "-cl-fast-relaxed-math -Werror -D FALSE=0 -D TRUE=1 "
1413 : "%s"
1414 : "-D iSrcWidth=%d -D iSrcHeight=%d -D iDstWidth=%d -D iDstHeight=%d "
1415 : "-D useUnifiedSrcDensity=%d -D useUnifiedSrcValid=%d "
1416 : "-D useDstDensity=%d -D useDstValid=%d -D useImag=%d "
1417 : "-D fXScale=%015.15lff -D fYScale=%015.15lff -D fXFilter=%015.15lff -D "
1418 : "fYFilter=%015.15lff "
1419 : "-D nXRadius=%d -D nYRadius=%d -D nFiltInitX=%d -D nFiltInitY=%d "
1420 : "-D PI=%015.15lff -D outType=%s -D dstMinVal=%015.15lff -D "
1421 : "dstMaxVal=%015.15lff "
1422 : "-D useDstNoDataReal=%d -D vecf=%s %s -D doCubicSpline=%d "
1423 : "-D useUseBandSrcValid=%d -D iCoordMult=%d ",
1424 0 : (warper->imageFormat == CL_FLOAT) ? "-D USE_CLAMP_TO_DST_FLOAT=1 " : "",
1425 : warper->srcWidth, warper->srcHeight, warper->dstWidth,
1426 : warper->dstHeight, warper->useUnifiedSrcDensity,
1427 : warper->useUnifiedSrcValid, warper->useDstDensity, warper->useDstValid,
1428 0 : warper->imagWorkCL != nullptr, dfXScale, dfYScale, dfXFilter, dfYFilter,
1429 : nXRadius, nYRadius, nFiltInitX, nFiltInitY, M_PI, outType, dstMinVal,
1430 0 : dstMaxVal, warper->fDstNoDataRealCL != nullptr, dVecf, dUseVec,
1431 0 : warper->resampAlg == OCL_CubicSpline,
1432 0 : warper->nBandSrcValidCL != nullptr, warper->coordMult);
1433 :
1434 0 : (*clErr) = err = clBuildProgram(program, 1, &(warper->dev), buffer.data(),
1435 : nullptr, nullptr);
1436 :
1437 : // Detailed debugging info
1438 0 : if (err != CL_SUCCESS)
1439 : {
1440 0 : const char *pszStatus = "unknown_status";
1441 0 : err = clGetProgramBuildInfo(program, warper->dev, CL_PROGRAM_BUILD_LOG,
1442 0 : PROGBUF_SIZE, &buffer[0], nullptr);
1443 0 : handleErrGoto(err, error_free_program);
1444 :
1445 0 : CPLError(CE_Failure, CPLE_AppDefined,
1446 : "Error: Failed to build program executable!\nBuild Log:\n%s",
1447 : buffer.c_str());
1448 :
1449 0 : err =
1450 0 : clGetProgramBuildInfo(program, warper->dev, CL_PROGRAM_BUILD_STATUS,
1451 0 : PROGBUF_SIZE, &buffer[0], nullptr);
1452 0 : handleErrGoto(err, error_free_program);
1453 :
1454 0 : if (buffer[0] == CL_BUILD_NONE)
1455 0 : pszStatus = "CL_BUILD_NONE";
1456 0 : else if (buffer[0] == CL_BUILD_ERROR)
1457 0 : pszStatus = "CL_BUILD_ERROR";
1458 0 : else if (buffer[0] == CL_BUILD_SUCCESS)
1459 0 : pszStatus = "CL_BUILD_SUCCESS";
1460 0 : else if (buffer[0] == CL_BUILD_IN_PROGRESS)
1461 0 : pszStatus = "CL_BUILD_IN_PROGRESS";
1462 :
1463 0 : CPLDebug("OpenCL", "Build Status: %s\nProgram Source:\n%s", pszStatus,
1464 : progBuf.c_str());
1465 0 : goto error_free_program;
1466 : }
1467 :
1468 0 : kernel = clCreateKernel(program, "resamp", &err);
1469 0 : handleErrGoto(err, error_free_program);
1470 :
1471 0 : err = clReleaseProgram(program);
1472 0 : handleErrGoto(err, error_final);
1473 :
1474 0 : return kernel;
1475 :
1476 0 : error_free_program:
1477 0 : err = clReleaseProgram(program);
1478 :
1479 0 : error_final:
1480 0 : return nullptr;
1481 : }
1482 :
1483 : /*
1484 : Alloc & copy the coordinate data from host working memory to the device. The
1485 : working memory should be a pinned, linear, array of floats. This allows us to
1486 : allocate and copy all data in one step. The pointer to the device memory is
1487 : saved and set as the appropriate argument number.
1488 :
1489 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
1490 : */
1491 0 : static cl_int set_coord_data(struct oclWarper *warper, cl_mem *xy)
1492 : {
1493 0 : cl_int err = CL_SUCCESS;
1494 : cl_image_format imgFmt;
1495 :
1496 : // Copy coord data to the device
1497 0 : imgFmt.image_channel_order = warper->xyChOrder;
1498 0 : imgFmt.image_channel_data_type = CL_FLOAT;
1499 :
1500 : #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || defined(__clang__)
1501 : #pragma GCC diagnostic push
1502 : #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
1503 : #endif
1504 0 : (*xy) = clCreateImage2D(warper->context,
1505 : CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &imgFmt,
1506 0 : static_cast<size_t>(warper->xyWidth),
1507 0 : static_cast<size_t>(warper->xyHeight),
1508 0 : sizeof(float) * warper->xyChSize * warper->xyWidth,
1509 0 : warper->xyWork, &err);
1510 : #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || defined(__clang__)
1511 : #pragma GCC diagnostic pop
1512 : #endif
1513 0 : handleErr(err);
1514 :
1515 : // Free the source memory, now that it's copied we don't need it
1516 0 : freeCLMem(warper->xyWorkCL, warper->xyWork);
1517 :
1518 : // Set up argument
1519 0 : if (warper->kern1 != nullptr)
1520 : {
1521 0 : handleErr(err = clSetKernelArg(warper->kern1, 0, sizeof(cl_mem), xy));
1522 : }
1523 0 : if (warper->kern4 != nullptr)
1524 : {
1525 0 : handleErr(err = clSetKernelArg(warper->kern4, 0, sizeof(cl_mem), xy));
1526 : }
1527 :
1528 0 : return CL_SUCCESS;
1529 : }
1530 :
1531 : /*
1532 : Sets the unified density & valid data structures. These are optional structures
1533 : from GDAL, and as such if they are NULL a small placeholder memory segment is
1534 : defined. This is because the spec is unclear on if a NULL value can be passed
1535 : as a kernel argument in place of memory. If it's not NULL, the data is copied
1536 : from the working memory to the device memory. After that, we check if we are
1537 : using the per-band validity mask, and set that as appropriate. At the end, the
1538 : CL mem is passed as the kernel arguments.
1539 :
1540 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
1541 : */
1542 : static cl_int
1543 0 : set_unified_data(struct oclWarper *warper, cl_mem *unifiedSrcDensityCL,
1544 : cl_mem *unifiedSrcValidCL, float *unifiedSrcDensity,
1545 : unsigned int *unifiedSrcValid, cl_mem *useBandSrcValidCL,
1546 : cl_mem *nBandSrcValidCL)
1547 : {
1548 0 : cl_int err = CL_SUCCESS;
1549 0 : size_t sz = warper->srcWidth * warper->srcHeight;
1550 0 : int useValid = warper->nBandSrcValidCL != nullptr;
1551 : // 32 bits in the mask
1552 0 : int validSz = static_cast<int>(sizeof(int) * ((31 + sz) >> 5));
1553 :
1554 : // Copy unifiedSrcDensity if it exists
1555 0 : if (unifiedSrcDensity == nullptr)
1556 : {
1557 : // Alloc dummy device RAM
1558 0 : (*unifiedSrcDensityCL) =
1559 0 : clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
1560 0 : handleErr(err);
1561 : }
1562 : else
1563 : {
1564 : // Alloc & copy all density data
1565 0 : (*unifiedSrcDensityCL) = clCreateBuffer(
1566 : warper->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
1567 : sizeof(float) * sz, unifiedSrcDensity, &err);
1568 0 : handleErr(err);
1569 : }
1570 :
1571 : // Copy unifiedSrcValid if it exists
1572 0 : if (unifiedSrcValid == nullptr)
1573 : {
1574 : // Alloc dummy device RAM
1575 0 : (*unifiedSrcValidCL) =
1576 0 : clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
1577 0 : handleErr(err);
1578 : }
1579 : else
1580 : {
1581 : // Alloc & copy all validity data
1582 0 : (*unifiedSrcValidCL) = clCreateBuffer(
1583 : warper->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, validSz,
1584 : unifiedSrcValid, &err);
1585 0 : handleErr(err);
1586 : }
1587 :
1588 : // Set the band validity usage
1589 0 : if (useValid)
1590 : {
1591 0 : (*useBandSrcValidCL) = clCreateBuffer(
1592 : warper->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
1593 0 : sizeof(char) * warper->numBands, warper->useBandSrcValid, &err);
1594 0 : handleErr(err);
1595 : }
1596 : else
1597 : {
1598 : // Make a fake image so we don't have a NULL pointer
1599 0 : (*useBandSrcValidCL) =
1600 0 : clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
1601 0 : handleErr(err);
1602 : }
1603 :
1604 : // Do a more thorough check for validity
1605 0 : if (useValid)
1606 : {
1607 : int i;
1608 0 : useValid = FALSE;
1609 0 : for (i = 0; i < warper->numBands; ++i)
1610 0 : if (warper->useBandSrcValid[i])
1611 0 : useValid = TRUE;
1612 : }
1613 :
1614 : // And the validity mask if needed
1615 0 : if (useValid)
1616 : {
1617 0 : (*nBandSrcValidCL) = clCreateBuffer(
1618 : warper->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
1619 0 : warper->numBands * validSz, warper->nBandSrcValid, &err);
1620 0 : handleErr(err);
1621 : }
1622 : else
1623 : {
1624 : // Make a fake image so we don't have a NULL pointer
1625 0 : (*nBandSrcValidCL) =
1626 0 : clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
1627 0 : handleErr(err);
1628 : }
1629 :
1630 : // Set up arguments
1631 0 : if (warper->kern1 != nullptr)
1632 : {
1633 0 : handleErr(err = clSetKernelArg(warper->kern1, 3, sizeof(cl_mem),
1634 : unifiedSrcDensityCL));
1635 0 : handleErr(err = clSetKernelArg(warper->kern1, 4, sizeof(cl_mem),
1636 : unifiedSrcValidCL));
1637 0 : handleErr(err = clSetKernelArg(warper->kern1, 5, sizeof(cl_mem),
1638 : useBandSrcValidCL));
1639 0 : handleErr(err = clSetKernelArg(warper->kern1, 6, sizeof(cl_mem),
1640 : nBandSrcValidCL));
1641 : }
1642 0 : if (warper->kern4 != nullptr)
1643 : {
1644 0 : handleErr(err = clSetKernelArg(warper->kern4, 3, sizeof(cl_mem),
1645 : unifiedSrcDensityCL));
1646 0 : handleErr(err = clSetKernelArg(warper->kern4, 4, sizeof(cl_mem),
1647 : unifiedSrcValidCL));
1648 0 : handleErr(err = clSetKernelArg(warper->kern4, 5, sizeof(cl_mem),
1649 : useBandSrcValidCL));
1650 0 : handleErr(err = clSetKernelArg(warper->kern4, 6, sizeof(cl_mem),
1651 : nBandSrcValidCL));
1652 : }
1653 :
1654 0 : return CL_SUCCESS;
1655 : }
1656 :
1657 : /*
1658 : Here we set the per-band raster data. First priority is the real raster data,
1659 : of course. Then, if applicable, we set the additional image channel. Once this
1660 : data is copied to the device, it can be freed on the host, so that is done
1661 : here. Finally the appropriate kernel arguments are set.
1662 :
1663 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
1664 : */
1665 0 : static cl_int set_src_rast_data(struct oclWarper *warper, int iNum, size_t sz,
1666 : cl_channel_order chOrder, cl_mem *srcReal,
1667 : cl_mem *srcImag)
1668 : {
1669 : cl_image_format imgFmt;
1670 0 : cl_int err = CL_SUCCESS;
1671 0 : int useImagWork =
1672 0 : warper->imagWork.v != nullptr && warper->imagWork.v[iNum] != nullptr;
1673 :
1674 : // Set up image vars
1675 0 : imgFmt.image_channel_order = chOrder;
1676 0 : imgFmt.image_channel_data_type = warper->imageFormat;
1677 :
1678 : // Create & copy the source image
1679 : #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || defined(__clang__)
1680 : #pragma GCC diagnostic push
1681 : #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
1682 : #endif
1683 :
1684 0 : (*srcReal) = clCreateImage2D(
1685 : warper->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &imgFmt,
1686 0 : static_cast<size_t>(warper->srcWidth),
1687 0 : static_cast<size_t>(warper->srcHeight), sz * warper->srcWidth,
1688 0 : warper->realWork.v[iNum], &err);
1689 0 : handleErr(err);
1690 :
1691 : // And the source image parts if needed
1692 0 : if (useImagWork)
1693 : {
1694 0 : (*srcImag) = clCreateImage2D(
1695 : warper->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &imgFmt,
1696 0 : static_cast<size_t>(warper->srcWidth),
1697 0 : static_cast<size_t>(warper->srcHeight), sz * warper->srcWidth,
1698 0 : warper->imagWork.v[iNum], &err);
1699 0 : handleErr(err);
1700 : }
1701 : else
1702 : {
1703 : // Make a fake image so we don't have a NULL pointer
1704 :
1705 : char dummyImageData[16];
1706 0 : (*srcImag) = clCreateImage2D(warper->context,
1707 : CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
1708 : &imgFmt, 1, 1, sz, dummyImageData, &err);
1709 :
1710 0 : handleErr(err);
1711 : }
1712 : #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || defined(__clang__)
1713 : #pragma GCC diagnostic pop
1714 : #endif
1715 :
1716 : // Free the source memory, now that it's copied we don't need it
1717 0 : freeCLMem(warper->realWorkCL[iNum], warper->realWork.v[iNum]);
1718 0 : if (warper->imagWork.v != nullptr)
1719 : {
1720 0 : freeCLMem(warper->imagWorkCL[iNum], warper->imagWork.v[iNum]);
1721 : }
1722 :
1723 : // Set up per-band arguments
1724 0 : if (warper->kern1 != nullptr)
1725 : {
1726 0 : handleErr(
1727 : err = clSetKernelArg(warper->kern1, 1, sizeof(cl_mem), srcReal));
1728 0 : handleErr(
1729 : err = clSetKernelArg(warper->kern1, 2, sizeof(cl_mem), srcImag));
1730 : }
1731 0 : if (warper->kern4 != nullptr)
1732 : {
1733 0 : handleErr(
1734 : err = clSetKernelArg(warper->kern4, 1, sizeof(cl_mem), srcReal));
1735 0 : handleErr(
1736 : err = clSetKernelArg(warper->kern4, 2, sizeof(cl_mem), srcImag));
1737 : }
1738 :
1739 0 : return CL_SUCCESS;
1740 : }
1741 :
1742 : /*
1743 : Set the destination data for the raster. Although it's the output, it still
1744 : is copied to the device because some blending is done there. First the real
1745 : data is allocated and copied, then the imag data is allocated and copied if
1746 : needed. They are then set as the appropriate arguments to the kernel.
1747 :
1748 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
1749 : */
1750 0 : static cl_int set_dst_rast_data(struct oclWarper *warper, int iImg, size_t sz,
1751 : cl_mem *dstReal, cl_mem *dstImag)
1752 : {
1753 0 : cl_int err = CL_SUCCESS;
1754 0 : sz *= warper->dstWidth * warper->dstHeight;
1755 :
1756 : // Copy the dst real data
1757 0 : (*dstReal) = clCreateBuffer(warper->context,
1758 : CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sz,
1759 0 : warper->dstRealWork.v[iImg], &err);
1760 0 : handleErr(err);
1761 :
1762 : // Copy the dst imag data if exists
1763 0 : if (warper->dstImagWork.v != nullptr &&
1764 0 : warper->dstImagWork.v[iImg] != nullptr)
1765 : {
1766 0 : (*dstImag) = clCreateBuffer(warper->context,
1767 : CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
1768 0 : sz, warper->dstImagWork.v[iImg], &err);
1769 0 : handleErr(err);
1770 : }
1771 : else
1772 : {
1773 0 : (*dstImag) = clCreateBuffer(warper->context, CL_MEM_READ_WRITE, 1,
1774 : nullptr, &err);
1775 0 : handleErr(err);
1776 : }
1777 :
1778 : // Set up per-band arguments
1779 0 : if (warper->kern1 != nullptr)
1780 : {
1781 0 : handleErr(
1782 : err = clSetKernelArg(warper->kern1, 7, sizeof(cl_mem), dstReal));
1783 0 : handleErr(
1784 : err = clSetKernelArg(warper->kern1, 8, sizeof(cl_mem), dstImag));
1785 : }
1786 0 : if (warper->kern4 != nullptr)
1787 : {
1788 0 : handleErr(
1789 : err = clSetKernelArg(warper->kern4, 7, sizeof(cl_mem), dstReal));
1790 0 : handleErr(
1791 : err = clSetKernelArg(warper->kern4, 8, sizeof(cl_mem), dstImag));
1792 : }
1793 :
1794 0 : return CL_SUCCESS;
1795 : }
1796 :
1797 : /*
1798 : Read the final raster data back from the graphics card to working memory. This
1799 : copies both the real memory and the imag memory if appropriate.
1800 :
1801 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
1802 : */
1803 0 : static cl_int get_dst_rast_data(struct oclWarper *warper, int iImg,
1804 : size_t wordSz, cl_mem dstReal, cl_mem dstImag)
1805 : {
1806 0 : cl_int err = CL_SUCCESS;
1807 0 : size_t sz = warper->dstWidth * warper->dstHeight * wordSz;
1808 :
1809 : // Copy from dev into working memory
1810 0 : handleErr(err = clEnqueueReadBuffer(warper->queue, dstReal, CL_FALSE, 0, sz,
1811 : warper->dstRealWork.v[iImg], 0, nullptr,
1812 : nullptr));
1813 :
1814 : // If we are expecting the imag channel, then copy it back also
1815 0 : if (warper->dstImagWork.v != nullptr &&
1816 0 : warper->dstImagWork.v[iImg] != nullptr)
1817 : {
1818 0 : handleErr(err = clEnqueueReadBuffer(warper->queue, dstImag, CL_FALSE, 0,
1819 : sz, warper->dstImagWork.v[iImg], 0,
1820 : nullptr, nullptr));
1821 : }
1822 :
1823 : // The copy requests were non-blocking, so we'll need to make sure they
1824 : // finish.
1825 0 : handleErr(err = clFinish(warper->queue));
1826 :
1827 0 : return CL_SUCCESS;
1828 : }
1829 :
1830 : /*
1831 : Set the destination image density & validity mask on the device. This is used
1832 : to blend the final output image with the existing buffer. This handles the
1833 : unified structures that apply to all bands. After the buffers are created and
1834 : copied, they are set as kernel arguments.
1835 :
1836 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
1837 : */
1838 0 : static cl_int set_dst_data(struct oclWarper *warper, cl_mem *dstDensityCL,
1839 : cl_mem *dstValidCL, cl_mem *dstNoDataRealCL,
1840 : float *dstDensity, unsigned int *dstValid,
1841 : float *dstNoDataReal)
1842 : {
1843 0 : cl_int err = CL_SUCCESS;
1844 0 : size_t sz = warper->dstWidth * warper->dstHeight;
1845 :
1846 : // Copy the no-data value(s)
1847 0 : if (dstNoDataReal == nullptr)
1848 : {
1849 0 : (*dstNoDataRealCL) =
1850 0 : clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
1851 0 : handleErr(err);
1852 : }
1853 : else
1854 : {
1855 0 : (*dstNoDataRealCL) = clCreateBuffer(
1856 : warper->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
1857 0 : sizeof(float) * warper->numBands, dstNoDataReal, &err);
1858 0 : handleErr(err);
1859 : }
1860 :
1861 : // Copy unifiedSrcDensity if it exists
1862 0 : if (dstDensity == nullptr)
1863 : {
1864 0 : (*dstDensityCL) =
1865 0 : clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
1866 0 : handleErr(err);
1867 : }
1868 : else
1869 : {
1870 0 : (*dstDensityCL) = clCreateBuffer(
1871 : warper->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
1872 : sizeof(float) * sz, dstDensity, &err);
1873 0 : handleErr(err);
1874 : }
1875 :
1876 : // Copy unifiedSrcValid if it exists
1877 0 : if (dstValid == nullptr)
1878 : {
1879 0 : (*dstValidCL) =
1880 0 : clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
1881 0 : handleErr(err);
1882 : }
1883 : else
1884 : {
1885 0 : (*dstValidCL) = clCreateBuffer(
1886 : warper->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
1887 0 : sizeof(int) * ((31 + sz) >> 5), dstValid, &err);
1888 0 : handleErr(err);
1889 : }
1890 :
1891 : // Set up arguments
1892 0 : if (warper->kern1 != nullptr)
1893 : {
1894 0 : handleErr(err = clSetKernelArg(warper->kern1, 9, sizeof(cl_mem),
1895 : dstNoDataRealCL));
1896 0 : handleErr(err = clSetKernelArg(warper->kern1, 10, sizeof(cl_mem),
1897 : dstDensityCL));
1898 0 : handleErr(err = clSetKernelArg(warper->kern1, 11, sizeof(cl_mem),
1899 : dstValidCL));
1900 : }
1901 0 : if (warper->kern4 != nullptr)
1902 : {
1903 0 : handleErr(err = clSetKernelArg(warper->kern4, 9, sizeof(cl_mem),
1904 : dstNoDataRealCL));
1905 0 : handleErr(err = clSetKernelArg(warper->kern4, 10, sizeof(cl_mem),
1906 : dstDensityCL));
1907 0 : handleErr(err = clSetKernelArg(warper->kern4, 11, sizeof(cl_mem),
1908 : dstValidCL));
1909 : }
1910 :
1911 0 : return CL_SUCCESS;
1912 : }
1913 :
1914 : /*
1915 : Go ahead and execute the kernel. This handles some housekeeping stuff like the
1916 : run dimensions. When running in debug mode, it times the kernel call and prints
1917 : the execution time.
1918 :
1919 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
1920 : */
1921 0 : static cl_int execute_kern(struct oclWarper *warper, cl_kernel kern,
1922 : size_t loc_size)
1923 : {
1924 0 : cl_int err = CL_SUCCESS;
1925 : cl_event ev;
1926 : size_t ceil_runs[2];
1927 : size_t group_size[2];
1928 : #ifdef DEBUG_OPENCL
1929 : size_t start_time = 0;
1930 : size_t end_time;
1931 : const char *vecTxt = "";
1932 : #endif
1933 :
1934 : // Use a likely X-dimension which is a power of 2
1935 0 : if (loc_size >= 512)
1936 0 : group_size[0] = 32;
1937 0 : else if (loc_size >= 64)
1938 0 : group_size[0] = 16;
1939 0 : else if (loc_size > 8)
1940 0 : group_size[0] = 8;
1941 : else
1942 0 : group_size[0] = 1;
1943 :
1944 0 : if (group_size[0] > loc_size)
1945 0 : group_size[1] = group_size[0] / loc_size;
1946 : else
1947 0 : group_size[1] = 1;
1948 :
1949 : // Round up num_runs to find the dim of the block of pixels we'll be
1950 : // processing
1951 0 : if (warper->dstWidth % group_size[0])
1952 0 : ceil_runs[0] =
1953 0 : warper->dstWidth + group_size[0] - warper->dstWidth % group_size[0];
1954 : else
1955 0 : ceil_runs[0] = warper->dstWidth;
1956 :
1957 0 : if (warper->dstHeight % group_size[1])
1958 0 : ceil_runs[1] = warper->dstHeight + group_size[1] -
1959 0 : warper->dstHeight % group_size[1];
1960 : else
1961 0 : ceil_runs[1] = warper->dstHeight;
1962 :
1963 : #ifdef DEBUG_OPENCL
1964 : handleErr(err = clSetCommandQueueProperty(
1965 : warper->queue, CL_QUEUE_PROFILING_ENABLE, CL_TRUE, nullptr));
1966 : #endif
1967 :
1968 : // Run the calculation by enqueuing it and forcing the
1969 : // command queue to complete the task
1970 0 : handleErr(err = clEnqueueNDRangeKernel(warper->queue, kern, 2, nullptr,
1971 : ceil_runs, group_size, 0, nullptr,
1972 : &ev));
1973 0 : handleErr(err = clFinish(warper->queue));
1974 :
1975 : #ifdef DEBUG_OPENCL
1976 : handleErr(err = clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START,
1977 : sizeof(size_t), &start_time,
1978 : nullptr));
1979 : handleErr(err =
1980 : clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END,
1981 : sizeof(size_t), &end_time, nullptr));
1982 : assert(end_time != 0);
1983 : assert(start_time != 0);
1984 : if (kern == warper->kern4)
1985 : vecTxt = "(vec)";
1986 :
1987 : CPLDebug("OpenCL", "Kernel Time: %6s %10lu", vecTxt,
1988 : static_cast<long int>((end_time - start_time) / 100000));
1989 : #endif
1990 :
1991 0 : handleErr(err = clReleaseEvent(ev));
1992 0 : return CL_SUCCESS;
1993 : }
1994 :
1995 : /*
1996 : Copy data from a raw source to the warper's working memory. If the imag
1997 : channel is expected, then the data will be de-interlaced into component blocks
1998 : of memory.
1999 :
2000 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
2001 : */
2002 0 : static cl_int set_img_data(struct oclWarper *warper, void *srcImgData,
2003 : unsigned int width, unsigned int height, int isSrc,
2004 : unsigned int bandNum, void **dstRealImgs,
2005 : void **dstImagImgs)
2006 : {
2007 0 : unsigned int imgChSize = warper->imgChSize1;
2008 : unsigned int iSrcY, i;
2009 0 : unsigned int vecOff = 0;
2010 0 : unsigned int imgNum = bandNum;
2011 0 : void *dstReal = nullptr;
2012 0 : void *dstImag = nullptr;
2013 :
2014 : // Handle vector if needed
2015 0 : if (warper->useVec &&
2016 0 : static_cast<int>(bandNum) < warper->numBands - warper->numBands % 4)
2017 : {
2018 0 : imgChSize = warper->imgChSize4;
2019 0 : vecOff = bandNum % 4;
2020 0 : imgNum = bandNum / 4;
2021 : }
2022 0 : else if (warper->useVec)
2023 : {
2024 0 : imgNum = bandNum / 4 + bandNum % 4;
2025 : }
2026 :
2027 : // Set the images as needed
2028 0 : dstReal = dstRealImgs[imgNum];
2029 0 : if (dstImagImgs == nullptr)
2030 0 : dstImag = nullptr;
2031 : else
2032 0 : dstImag = dstImagImgs[imgNum];
2033 :
2034 : // Set stuff for dst imgs
2035 0 : if (!isSrc)
2036 : {
2037 0 : vecOff *= height * width;
2038 0 : imgChSize = 1;
2039 : }
2040 :
2041 : // Copy values as needed
2042 0 : if (warper->imagWorkCL == nullptr && !(warper->useVec && isSrc))
2043 : {
2044 : // Set memory size & location depending on the data type
2045 : // This is the ideal code path for speed
2046 0 : switch (warper->imageFormat)
2047 : {
2048 0 : case CL_UNORM_INT8:
2049 : {
2050 0 : unsigned char *realDst =
2051 0 : &((static_cast<unsigned char *>(dstReal))[vecOff]);
2052 0 : memcpy(realDst, srcImgData,
2053 0 : width * height * sizeof(unsigned char));
2054 0 : break;
2055 : }
2056 0 : case CL_SNORM_INT8:
2057 : {
2058 0 : char *realDst = &((static_cast<char *>(dstReal))[vecOff]);
2059 0 : memcpy(realDst, srcImgData, width * height * sizeof(char));
2060 0 : break;
2061 : }
2062 0 : case CL_UNORM_INT16:
2063 : {
2064 0 : unsigned short *realDst =
2065 0 : &((static_cast<unsigned short *>(dstReal))[vecOff]);
2066 0 : memcpy(realDst, srcImgData,
2067 0 : width * height * sizeof(unsigned short));
2068 0 : break;
2069 : }
2070 0 : case CL_SNORM_INT16:
2071 : {
2072 0 : short *realDst = &((static_cast<short *>(dstReal))[vecOff]);
2073 0 : memcpy(realDst, srcImgData, width * height * sizeof(short));
2074 0 : break;
2075 : }
2076 0 : case CL_FLOAT:
2077 : {
2078 0 : float *realDst = &((static_cast<float *>(dstReal))[vecOff]);
2079 0 : memcpy(realDst, srcImgData, width * height * sizeof(float));
2080 0 : break;
2081 : }
2082 0 : }
2083 : }
2084 0 : else if (warper->imagWorkCL == nullptr)
2085 : {
2086 : // We need to space the values due to OpenCL implementation reasons
2087 0 : for (iSrcY = 0; iSrcY < height; iSrcY++)
2088 : {
2089 0 : int pxOff = width * iSrcY;
2090 0 : int imgOff = imgChSize * pxOff + vecOff;
2091 : // Copy & deinterleave interleaved data
2092 0 : switch (warper->imageFormat)
2093 : {
2094 0 : case CL_UNORM_INT8:
2095 : {
2096 0 : unsigned char *realDst =
2097 0 : &((static_cast<unsigned char *>(dstReal))[imgOff]);
2098 0 : unsigned char *dataSrc =
2099 0 : &((static_cast<unsigned char *>(srcImgData))[pxOff]);
2100 0 : for (i = 0; i < width; ++i)
2101 0 : realDst[imgChSize * i] = dataSrc[i];
2102 : }
2103 0 : break;
2104 0 : case CL_SNORM_INT8:
2105 : {
2106 0 : char *realDst = &((static_cast<char *>(dstReal))[imgOff]);
2107 0 : char *dataSrc = &((static_cast<char *>(srcImgData))[pxOff]);
2108 0 : for (i = 0; i < width; ++i)
2109 0 : realDst[imgChSize * i] = dataSrc[i];
2110 : }
2111 0 : break;
2112 0 : case CL_UNORM_INT16:
2113 : {
2114 0 : unsigned short *realDst =
2115 0 : &((static_cast<unsigned short *>(dstReal))[imgOff]);
2116 0 : unsigned short *dataSrc =
2117 0 : &((static_cast<unsigned short *>(srcImgData))[pxOff]);
2118 0 : for (i = 0; i < width; ++i)
2119 0 : realDst[imgChSize * i] = dataSrc[i];
2120 : }
2121 0 : break;
2122 0 : case CL_SNORM_INT16:
2123 : {
2124 0 : short *realDst = &((static_cast<short *>(dstReal))[imgOff]);
2125 0 : short *dataSrc =
2126 0 : &((static_cast<short *>(srcImgData))[pxOff]);
2127 0 : for (i = 0; i < width; ++i)
2128 0 : realDst[imgChSize * i] = dataSrc[i];
2129 : }
2130 0 : break;
2131 0 : case CL_FLOAT:
2132 : {
2133 0 : float *realDst = &((static_cast<float *>(dstReal))[imgOff]);
2134 0 : float *dataSrc =
2135 0 : &((static_cast<float *>(srcImgData))[pxOff]);
2136 0 : for (i = 0; i < width; ++i)
2137 0 : realDst[imgChSize * i] = dataSrc[i];
2138 : }
2139 0 : break;
2140 : }
2141 : }
2142 : }
2143 : else
2144 : {
2145 0 : assert(dstImag);
2146 :
2147 : // Copy, deinterleave, & space interleaved data
2148 0 : for (iSrcY = 0; iSrcY < height; iSrcY++)
2149 : {
2150 0 : int pxOff = width * iSrcY;
2151 0 : int imgOff = imgChSize * pxOff + vecOff;
2152 : // Copy & deinterleave interleaved data
2153 0 : switch (warper->imageFormat)
2154 : {
2155 0 : case CL_FLOAT:
2156 : {
2157 0 : float *realDst = &((static_cast<float *>(dstReal))[imgOff]);
2158 0 : float *imagDst = &((static_cast<float *>(dstImag))[imgOff]);
2159 0 : float *dataSrc =
2160 0 : &((static_cast<float *>(srcImgData))[pxOff]);
2161 0 : for (i = 0; i < width; ++i)
2162 : {
2163 0 : realDst[imgChSize * i] = dataSrc[i * 2];
2164 0 : imagDst[imgChSize * i] = dataSrc[i * 2 + 1];
2165 : }
2166 : }
2167 0 : break;
2168 0 : case CL_SNORM_INT8:
2169 : {
2170 0 : char *realDst = &((static_cast<char *>(dstReal))[imgOff]);
2171 0 : char *imagDst = &((static_cast<char *>(dstImag))[imgOff]);
2172 0 : char *dataSrc = &((static_cast<char *>(srcImgData))[pxOff]);
2173 0 : for (i = 0; i < width; ++i)
2174 : {
2175 0 : realDst[imgChSize * i] = dataSrc[i * 2];
2176 0 : imagDst[imgChSize * i] = dataSrc[i * 2 + 1];
2177 : }
2178 : }
2179 0 : break;
2180 0 : case CL_UNORM_INT8:
2181 : {
2182 0 : unsigned char *realDst =
2183 0 : &((static_cast<unsigned char *>(dstReal))[imgOff]);
2184 0 : unsigned char *imagDst =
2185 0 : &((static_cast<unsigned char *>(dstImag))[imgOff]);
2186 0 : unsigned char *dataSrc =
2187 0 : &((static_cast<unsigned char *>(srcImgData))[pxOff]);
2188 0 : for (i = 0; i < width; ++i)
2189 : {
2190 0 : realDst[imgChSize * i] = dataSrc[i * 2];
2191 0 : imagDst[imgChSize * i] = dataSrc[i * 2 + 1];
2192 : }
2193 : }
2194 0 : break;
2195 0 : case CL_SNORM_INT16:
2196 : {
2197 0 : short *realDst = &((static_cast<short *>(dstReal))[imgOff]);
2198 0 : short *imagDst = &((static_cast<short *>(dstImag))[imgOff]);
2199 0 : short *dataSrc =
2200 0 : &((static_cast<short *>(srcImgData))[pxOff]);
2201 0 : for (i = 0; i < width; ++i)
2202 : {
2203 0 : realDst[imgChSize * i] = dataSrc[i * 2];
2204 0 : imagDst[imgChSize * i] = dataSrc[i * 2 + 1];
2205 : }
2206 : }
2207 0 : break;
2208 0 : case CL_UNORM_INT16:
2209 : {
2210 0 : unsigned short *realDst =
2211 0 : &((static_cast<unsigned short *>(dstReal))[imgOff]);
2212 0 : unsigned short *imagDst =
2213 0 : &((static_cast<unsigned short *>(dstImag))[imgOff]);
2214 0 : unsigned short *dataSrc =
2215 0 : &((static_cast<unsigned short *>(srcImgData))[pxOff]);
2216 0 : for (i = 0; i < width; ++i)
2217 : {
2218 0 : realDst[imgChSize * i] = dataSrc[i * 2];
2219 0 : imagDst[imgChSize * i] = dataSrc[i * 2 + 1];
2220 : }
2221 : }
2222 0 : break;
2223 : }
2224 : }
2225 : }
2226 :
2227 0 : return CL_SUCCESS;
2228 : }
2229 :
2230 : /*
2231 : Creates the struct which inits & contains the OpenCL context & environment.
2232 : Inits wired(?) space to buffer the image in host RAM. Chooses the OpenCL
2233 : device, perhaps the user can choose it later? This would also choose the
2234 : appropriate OpenCL image format (R, RG, RGBA, or multiples thereof). Space
2235 : for metadata can be allocated as required, though.
2236 :
2237 : Supported image formats are:
2238 : CL_FLOAT, CL_SNORM_INT8, CL_UNORM_INT8, CL_SNORM_INT16, CL_UNORM_INT16
2239 : 32-bit int formats won't keep precision when converted to floats internally
2240 : and doubles are generally not supported on the GPU image formats.
2241 : */
2242 0 : struct oclWarper *GDALWarpKernelOpenCL_createEnv(
2243 : int srcWidth, int srcHeight, int dstWidth, int dstHeight,
2244 : cl_channel_type imageFormat, int numBands, int coordMult, int useImag,
2245 : int useBandSrcValid, CPL_UNUSED float *fDstDensity, double *dfDstNoDataReal,
2246 : OCLResampAlg resampAlg, cl_int *clErr)
2247 : {
2248 : struct oclWarper *warper;
2249 : int i;
2250 0 : size_t maxWidth = 0, maxHeight = 0;
2251 0 : cl_int err = CL_SUCCESS;
2252 : size_t fmtSize, sz;
2253 : cl_device_id device;
2254 : cl_bool bool_flag;
2255 0 : OCLVendor eCLVendor = VENDOR_OTHER;
2256 :
2257 : // Do we have a suitable OpenCL device?
2258 0 : device = get_device(&eCLVendor);
2259 0 : if (device == nullptr)
2260 0 : return nullptr;
2261 :
2262 0 : err = clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool),
2263 : &bool_flag, &sz);
2264 0 : if (err != CL_SUCCESS || !bool_flag)
2265 : {
2266 0 : CPLDebug("OpenCL", "No image support on selected device.");
2267 0 : return nullptr;
2268 : }
2269 :
2270 : // Set up warper environment.
2271 : warper =
2272 0 : static_cast<struct oclWarper *>(CPLCalloc(1, sizeof(struct oclWarper)));
2273 :
2274 0 : warper->eCLVendor = eCLVendor;
2275 :
2276 : // Init passed vars
2277 0 : warper->srcWidth = srcWidth;
2278 0 : warper->srcHeight = srcHeight;
2279 0 : warper->dstWidth = dstWidth;
2280 0 : warper->dstHeight = dstHeight;
2281 :
2282 0 : warper->coordMult = coordMult;
2283 0 : warper->numBands = numBands;
2284 0 : warper->imageFormat = imageFormat;
2285 0 : warper->resampAlg = resampAlg;
2286 :
2287 0 : warper->useUnifiedSrcDensity = FALSE;
2288 0 : warper->useUnifiedSrcValid = FALSE;
2289 0 : warper->useDstDensity = FALSE;
2290 0 : warper->useDstValid = FALSE;
2291 :
2292 0 : warper->imagWorkCL = nullptr;
2293 0 : warper->dstImagWorkCL = nullptr;
2294 0 : warper->useBandSrcValidCL = nullptr;
2295 0 : warper->useBandSrcValid = nullptr;
2296 0 : warper->nBandSrcValidCL = nullptr;
2297 0 : warper->nBandSrcValid = nullptr;
2298 0 : warper->fDstNoDataRealCL = nullptr;
2299 0 : warper->fDstNoDataReal = nullptr;
2300 0 : warper->kern1 = nullptr;
2301 0 : warper->kern4 = nullptr;
2302 :
2303 0 : warper->dev = device;
2304 :
2305 0 : warper->context =
2306 0 : clCreateContext(nullptr, 1, &(warper->dev), nullptr, nullptr, &err);
2307 0 : handleErrGoto(err, error_label);
2308 : #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || defined(__clang__)
2309 : #pragma GCC diagnostic push
2310 : #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
2311 : #endif
2312 0 : warper->queue = clCreateCommandQueue(warper->context, warper->dev, 0, &err);
2313 : #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || defined(__clang__)
2314 : #pragma GCC diagnostic pop
2315 : #endif
2316 0 : handleErrGoto(err, error_label);
2317 :
2318 : // Ensure that we hand handle imagery of these dimensions
2319 0 : err = clGetDeviceInfo(warper->dev, CL_DEVICE_IMAGE2D_MAX_WIDTH,
2320 : sizeof(size_t), &maxWidth, &sz);
2321 0 : handleErrGoto(err, error_label);
2322 0 : err = clGetDeviceInfo(warper->dev, CL_DEVICE_IMAGE2D_MAX_HEIGHT,
2323 : sizeof(size_t), &maxHeight, &sz);
2324 0 : handleErrGoto(err, error_label);
2325 0 : if (maxWidth < static_cast<size_t>(srcWidth) ||
2326 0 : maxHeight < static_cast<size_t>(srcHeight))
2327 : {
2328 0 : err = CL_INVALID_IMAGE_SIZE;
2329 0 : handleErrGoto(err, error_label);
2330 : }
2331 :
2332 : // Split bands into sets of four when possible
2333 : // Cubic runs slower as vector, so don't use it (probably register pressure)
2334 : // Feel free to do more testing and come up with more precise case
2335 : // statements
2336 0 : if (numBands < 4 || resampAlg == OCL_Cubic)
2337 : {
2338 0 : warper->numImages = numBands;
2339 0 : warper->useVec = FALSE;
2340 : }
2341 : else
2342 : {
2343 0 : warper->numImages = numBands / 4 + numBands % 4;
2344 0 : warper->useVec = TRUE;
2345 : }
2346 :
2347 : // Make the pointer space for the real images
2348 0 : warper->realWorkCL =
2349 0 : static_cast<cl_mem *>(CPLCalloc(sizeof(cl_mem), warper->numImages));
2350 0 : warper->dstRealWorkCL =
2351 0 : static_cast<cl_mem *>(CPLCalloc(sizeof(cl_mem), warper->numImages));
2352 :
2353 : // Make space for the per-channel Imag data (if exists)
2354 0 : if (useImag)
2355 : {
2356 0 : warper->imagWorkCL =
2357 0 : static_cast<cl_mem *>(CPLCalloc(sizeof(cl_mem), warper->numImages));
2358 0 : warper->dstImagWorkCL =
2359 0 : static_cast<cl_mem *>(CPLCalloc(sizeof(cl_mem), warper->numImages));
2360 : }
2361 :
2362 : // Make space for the per-band BandSrcValid data (if exists)
2363 0 : if (useBandSrcValid)
2364 : {
2365 : // 32 bits in the mask
2366 0 : sz = warper->numBands *
2367 0 : ((31 + warper->srcWidth * warper->srcHeight) >> 5);
2368 :
2369 : // Allocate some space for the validity of the validity mask
2370 : void *useBandSrcValidTab[1];
2371 : cl_mem useBandSrcValidCLTab[1];
2372 0 : err = alloc_pinned_mem(warper, 0, warper->numBands * sizeof(char),
2373 : useBandSrcValidTab, useBandSrcValidCLTab);
2374 0 : warper->useBandSrcValid = static_cast<char *>(useBandSrcValidTab[0]);
2375 0 : warper->useBandSrcValidCL = useBandSrcValidCLTab[0];
2376 0 : handleErrGoto(err, error_label);
2377 :
2378 0 : for (i = 0; i < warper->numBands; ++i)
2379 0 : warper->useBandSrcValid[i] = FALSE;
2380 :
2381 : // Allocate one array for all the band validity masks.
2382 : // Remember that the masks don't use much memory (they're bitwise).
2383 : void *nBandSrcValidTab[1];
2384 : cl_mem nBandSrcValidCLTab[1];
2385 0 : err = alloc_pinned_mem(warper, 0, sz * sizeof(int), nBandSrcValidTab,
2386 : nBandSrcValidCLTab);
2387 0 : warper->nBandSrcValid = static_cast<float *>(nBandSrcValidTab[0]);
2388 0 : warper->nBandSrcValidCL = nBandSrcValidCLTab[0];
2389 0 : handleErrGoto(err, error_label);
2390 : }
2391 :
2392 : // Make space for the per-band
2393 0 : if (dfDstNoDataReal != nullptr)
2394 : {
2395 : void *fDstNoDataRealTab[1];
2396 : cl_mem fDstNoDataRealCLTab[1];
2397 0 : alloc_pinned_mem(warper, 0, warper->numBands, fDstNoDataRealTab,
2398 : fDstNoDataRealCLTab);
2399 0 : warper->fDstNoDataReal = static_cast<float *>(fDstNoDataRealTab[0]);
2400 0 : warper->fDstNoDataRealCL = fDstNoDataRealCLTab[0];
2401 :
2402 : // Copy over values
2403 0 : for (i = 0; i < warper->numBands; ++i)
2404 0 : warper->fDstNoDataReal[i] = static_cast<float>(dfDstNoDataReal[i]);
2405 : }
2406 :
2407 : // Alloc working host image memory
2408 : // We'll be copying into these buffers soon
2409 0 : switch (imageFormat)
2410 : {
2411 0 : case CL_FLOAT:
2412 0 : err = alloc_working_arr(warper, sizeof(float *), sizeof(float),
2413 : &fmtSize);
2414 0 : break;
2415 0 : case CL_SNORM_INT8:
2416 0 : err = alloc_working_arr(warper, sizeof(char *), sizeof(char),
2417 : &fmtSize);
2418 0 : break;
2419 0 : case CL_UNORM_INT8:
2420 0 : err = alloc_working_arr(warper, sizeof(unsigned char *),
2421 : sizeof(unsigned char), &fmtSize);
2422 0 : break;
2423 0 : case CL_SNORM_INT16:
2424 0 : err = alloc_working_arr(warper, sizeof(short *), sizeof(short),
2425 : &fmtSize);
2426 0 : break;
2427 0 : case CL_UNORM_INT16:
2428 0 : err = alloc_working_arr(warper, sizeof(unsigned short *),
2429 : sizeof(unsigned short), &fmtSize);
2430 0 : break;
2431 : }
2432 0 : handleErrGoto(err, error_label);
2433 :
2434 : // Find a good & compatible image channel order for the Lat/Long array.
2435 0 : err = set_supported_formats(warper, 2, &(warper->xyChOrder),
2436 : &(warper->xyChSize), CL_FLOAT);
2437 0 : handleErrGoto(err, error_label);
2438 :
2439 : // Set coordinate image dimensions
2440 0 : warper->xyWidth =
2441 0 : static_cast<int>(ceil((static_cast<float>(warper->dstWidth) +
2442 0 : static_cast<float>(warper->coordMult) - 1) /
2443 0 : static_cast<float>(warper->coordMult)));
2444 0 : warper->xyHeight =
2445 0 : static_cast<int>(ceil((static_cast<float>(warper->dstHeight) +
2446 0 : static_cast<float>(warper->coordMult) - 1) /
2447 0 : static_cast<float>(warper->coordMult)));
2448 :
2449 : // Alloc coord memory
2450 0 : sz = sizeof(float) * warper->xyChSize * warper->xyWidth * warper->xyHeight;
2451 : void *xyWorkTab[1];
2452 : cl_mem xyWorkCLTab[1];
2453 0 : err = alloc_pinned_mem(warper, 0, sz, xyWorkTab, xyWorkCLTab);
2454 0 : warper->xyWork = static_cast<float *>(xyWorkTab[0]);
2455 0 : warper->xyWorkCL = xyWorkCLTab[0];
2456 0 : handleErrGoto(err, error_label);
2457 :
2458 : // Ensure everything is finished allocating, copying, & mapping
2459 0 : err = clFinish(warper->queue);
2460 0 : handleErrGoto(err, error_label);
2461 :
2462 0 : (*clErr) = CL_SUCCESS;
2463 0 : return warper;
2464 :
2465 0 : error_label:
2466 0 : GDALWarpKernelOpenCL_deleteEnv(warper);
2467 0 : return nullptr;
2468 : }
2469 :
2470 : /*
2471 : Copy the validity mask for an image band to the warper.
2472 :
2473 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
2474 : */
2475 0 : cl_int GDALWarpKernelOpenCL_setSrcValid(struct oclWarper *warper,
2476 : int *bandSrcValid, int bandNum)
2477 : {
2478 : // 32 bits in the mask
2479 0 : int stride = (31 + warper->srcWidth * warper->srcHeight) >> 5;
2480 :
2481 : // Copy bandSrcValid
2482 0 : assert(warper->nBandSrcValid != nullptr);
2483 0 : memcpy(&(warper->nBandSrcValid[bandNum * stride]), bandSrcValid,
2484 0 : sizeof(int) * stride);
2485 0 : warper->useBandSrcValid[bandNum] = TRUE;
2486 :
2487 0 : return CL_SUCCESS;
2488 : }
2489 :
2490 : /*
2491 : Sets the source image real & imag into the host memory so that it is
2492 : permuted (ex. RGBA) for better graphics card access.
2493 :
2494 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
2495 : */
2496 0 : cl_int GDALWarpKernelOpenCL_setSrcImg(struct oclWarper *warper, void *imgData,
2497 : int bandNum)
2498 : {
2499 0 : void **imagWorkPtr = nullptr;
2500 :
2501 0 : if (warper->imagWorkCL != nullptr)
2502 0 : imagWorkPtr = warper->imagWork.v;
2503 :
2504 0 : return set_img_data(warper, imgData, warper->srcWidth, warper->srcHeight,
2505 0 : TRUE, bandNum, warper->realWork.v, imagWorkPtr);
2506 : }
2507 :
2508 : /*
2509 : Sets the destination image real & imag into the host memory so that it is
2510 : permuted (ex. RGBA) for better graphics card access.
2511 :
2512 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
2513 : */
2514 0 : cl_int GDALWarpKernelOpenCL_setDstImg(struct oclWarper *warper, void *imgData,
2515 : int bandNum)
2516 : {
2517 0 : void **dstImagWorkPtr = nullptr;
2518 :
2519 0 : if (warper->dstImagWorkCL != nullptr)
2520 0 : dstImagWorkPtr = warper->dstImagWork.v;
2521 :
2522 0 : return set_img_data(warper, imgData, warper->dstWidth, warper->dstHeight,
2523 0 : FALSE, bandNum, warper->dstRealWork.v, dstImagWorkPtr);
2524 : }
2525 :
2526 : /*
2527 : Inputs the source coordinates for a row of the destination pixels. Invalid
2528 : coordinates are set as -99.0, which should be out of the image bounds. Sets
2529 : the coordinates as ready to be used in OpenCL image memory: interleaved and
2530 : minus the offset. By using image memory, we can use a smaller texture for
2531 : coordinates and use OpenCL's built-in interpolation to save memory.
2532 :
2533 : What it does: generates a smaller matrix of X/Y coordinate transformation
2534 : values from an original matrix. When bilinearly sampled in the GPU hardware,
2535 : the generated values are as close as possible to the original matrix.
2536 :
2537 : Complication: matrices have arbitrary dimensions and the sub-sampling factor
2538 : is an arbitrary integer greater than zero. Getting the edge cases right is
2539 : difficult.
2540 :
2541 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
2542 : */
2543 0 : cl_int GDALWarpKernelOpenCL_setCoordRow(struct oclWarper *warper,
2544 : double *rowSrcX, double *rowSrcY,
2545 : double srcXOff, double srcYOff,
2546 : int *success, int rowNum)
2547 : {
2548 0 : int coordMult = warper->coordMult;
2549 0 : int width = warper->dstWidth;
2550 0 : int height = warper->dstHeight;
2551 0 : int xyWidth = warper->xyWidth;
2552 : int i;
2553 0 : int xyChSize = warper->xyChSize;
2554 0 : float *xyPtr, *xyPrevPtr = nullptr;
2555 0 : int lastRow = rowNum == height - 1;
2556 0 : double dstHeightMod = 1.0, dstWidthMod = 1.0;
2557 :
2558 : // Return if we're at an off row
2559 0 : if (!lastRow && rowNum % coordMult != 0)
2560 0 : return CL_SUCCESS;
2561 :
2562 : // Standard row, adjusted for the skipped rows
2563 0 : xyPtr = &(warper->xyWork[xyWidth * xyChSize * rowNum / coordMult]);
2564 :
2565 : // Find our row
2566 0 : if (lastRow)
2567 : {
2568 : // Setup for the final row
2569 0 : xyPtr = &(warper->xyWork[xyWidth * xyChSize * (warper->xyHeight - 1)]);
2570 0 : xyPrevPtr =
2571 0 : &(warper->xyWork[xyWidth * xyChSize * (warper->xyHeight - 2)]);
2572 :
2573 0 : if ((height - 1) % coordMult)
2574 0 : dstHeightMod = static_cast<double>(coordMult) /
2575 0 : static_cast<double>((height - 1) % coordMult);
2576 : }
2577 :
2578 : // Copy selected coordinates
2579 0 : for (i = 0; i < width; i += coordMult)
2580 : {
2581 0 : if (success[i])
2582 : {
2583 0 : xyPtr[0] = static_cast<float>(rowSrcX[i] - srcXOff);
2584 0 : xyPtr[1] = static_cast<float>(rowSrcY[i] - srcYOff);
2585 :
2586 0 : if (lastRow)
2587 : {
2588 : // Adjust bottom row so interpolator returns correct value
2589 0 : xyPtr[0] = static_cast<float>(
2590 0 : dstHeightMod * (xyPtr[0] - xyPrevPtr[0]) + xyPrevPtr[0]);
2591 0 : xyPtr[1] = static_cast<float>(
2592 0 : dstHeightMod * (xyPtr[1] - xyPrevPtr[1]) + xyPrevPtr[1]);
2593 : }
2594 : }
2595 : else
2596 : {
2597 0 : xyPtr[0] = -99.0f;
2598 0 : xyPtr[1] = -99.0f;
2599 : }
2600 :
2601 0 : xyPtr += xyChSize;
2602 0 : xyPrevPtr += xyChSize;
2603 : }
2604 :
2605 : // Copy remaining coordinate
2606 0 : if ((width - 1) % coordMult)
2607 : {
2608 0 : dstWidthMod = static_cast<double>(coordMult) /
2609 0 : static_cast<double>((width - 1) % coordMult);
2610 0 : xyPtr -= xyChSize;
2611 0 : xyPrevPtr -= xyChSize;
2612 : }
2613 : else
2614 : {
2615 0 : xyPtr -= xyChSize * 2;
2616 0 : xyPrevPtr -= xyChSize * 2;
2617 : }
2618 :
2619 0 : if (lastRow)
2620 : {
2621 0 : double origX = rowSrcX[width - 1] - srcXOff;
2622 0 : double origY = rowSrcY[width - 1] - srcYOff;
2623 0 : double a = 1.0, b = 1.0;
2624 :
2625 : // Calculate the needed x/y values using an equation from the OpenCL
2626 : // Spec section 8.2, solving for Ti1j1
2627 0 : if ((width - 1) % coordMult)
2628 0 : a = ((width - 1) % coordMult) / static_cast<double>(coordMult);
2629 :
2630 0 : if ((height - 1) % coordMult)
2631 0 : b = ((height - 1) % coordMult) / static_cast<double>(coordMult);
2632 :
2633 0 : xyPtr[xyChSize] = static_cast<float>(
2634 0 : (((1.0 - a) * (1.0 - b) * xyPrevPtr[0] +
2635 0 : a * (1.0 - b) * xyPrevPtr[xyChSize] + (1.0 - a) * b * xyPtr[0]) -
2636 0 : origX) /
2637 0 : (-a * b));
2638 :
2639 0 : xyPtr[xyChSize + 1] =
2640 0 : static_cast<float>((((1.0 - a) * (1.0 - b) * xyPrevPtr[1] +
2641 0 : a * (1.0 - b) * xyPrevPtr[xyChSize + 1] +
2642 0 : (1.0 - a) * b * xyPtr[1]) -
2643 0 : origY) /
2644 0 : (-a * b));
2645 : }
2646 : else
2647 : {
2648 : // Adjust last coordinate so interpolator returns correct value
2649 0 : xyPtr[xyChSize] = static_cast<float>(
2650 0 : dstWidthMod * (rowSrcX[width - 1] - srcXOff - xyPtr[0]) + xyPtr[0]);
2651 0 : xyPtr[xyChSize + 1] = static_cast<float>(
2652 0 : dstWidthMod * (rowSrcY[width - 1] - srcYOff - xyPtr[1]) + xyPtr[1]);
2653 : }
2654 :
2655 0 : return CL_SUCCESS;
2656 : }
2657 :
2658 : /*
2659 : Copies all data to the device RAM, frees the host RAM, runs the
2660 : appropriate resampling kernel, mallocs output space, & copies the data
2661 : back from the device RAM for each band. Also check to make sure that
2662 : setRow*() was called the appropriate number of times to init all image
2663 : data.
2664 :
2665 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
2666 : */
2667 0 : cl_int GDALWarpKernelOpenCL_runResamp(
2668 : struct oclWarper *warper, float *unifiedSrcDensity,
2669 : unsigned int *unifiedSrcValid, float *dstDensity, unsigned int *dstValid,
2670 : double dfXScale, double dfYScale, double dfXFilter, double dfYFilter,
2671 : int nXRadius, int nYRadius, int nFiltInitX, int nFiltInitY)
2672 : {
2673 0 : int i, nextBandNum = 0, chSize = 1;
2674 0 : cl_int err = CL_SUCCESS;
2675 : cl_mem xy, unifiedSrcDensityCL, unifiedSrcValidCL;
2676 : cl_mem dstDensityCL, dstValidCL, dstNoDataRealCL;
2677 : cl_mem useBandSrcValidCL, nBandSrcValidCL;
2678 0 : size_t groupSize, wordSize = 0;
2679 0 : cl_kernel kern = nullptr;
2680 : cl_channel_order chOrder;
2681 :
2682 0 : warper->useUnifiedSrcDensity = unifiedSrcDensity != nullptr;
2683 0 : warper->useUnifiedSrcValid = unifiedSrcValid != nullptr;
2684 :
2685 : // Check the word size
2686 0 : switch (warper->imageFormat)
2687 : {
2688 0 : case CL_FLOAT:
2689 0 : wordSize = sizeof(float);
2690 0 : break;
2691 0 : case CL_SNORM_INT8:
2692 0 : wordSize = sizeof(char);
2693 0 : break;
2694 0 : case CL_UNORM_INT8:
2695 0 : wordSize = sizeof(unsigned char);
2696 0 : break;
2697 0 : case CL_SNORM_INT16:
2698 0 : wordSize = sizeof(short);
2699 0 : break;
2700 0 : case CL_UNORM_INT16:
2701 0 : wordSize = sizeof(unsigned short);
2702 0 : break;
2703 : }
2704 :
2705 : // Compile the kernel; the invariants are being compiled into the code
2706 0 : if (!warper->useVec || warper->numBands % 4)
2707 : {
2708 0 : warper->kern1 =
2709 0 : get_kernel(warper, FALSE, dfXScale, dfYScale, dfXFilter, dfYFilter,
2710 : nXRadius, nYRadius, nFiltInitX, nFiltInitY, &err);
2711 0 : handleErr(err);
2712 : }
2713 0 : if (warper->useVec)
2714 : {
2715 0 : warper->kern4 =
2716 0 : get_kernel(warper, TRUE, dfXScale, dfYScale, dfXFilter, dfYFilter,
2717 : nXRadius, nYRadius, nFiltInitX, nFiltInitY, &err);
2718 0 : handleErr(err);
2719 : }
2720 :
2721 : // Copy coord data to the device
2722 0 : handleErr(err = set_coord_data(warper, &xy));
2723 :
2724 : // Copy unified density & valid data
2725 0 : handleErr(err = set_unified_data(warper, &unifiedSrcDensityCL,
2726 : &unifiedSrcValidCL, unifiedSrcDensity,
2727 : unifiedSrcValid, &useBandSrcValidCL,
2728 : &nBandSrcValidCL));
2729 :
2730 : // Copy output density & valid data
2731 0 : handleErr(set_dst_data(warper, &dstDensityCL, &dstValidCL, &dstNoDataRealCL,
2732 : dstDensity, dstValid, warper->fDstNoDataReal));
2733 :
2734 : // What's the recommended group size?
2735 0 : if (warper->useVec)
2736 : {
2737 : // Start with the vector kernel
2738 0 : handleErr(clGetKernelWorkGroupInfo(
2739 : warper->kern4, warper->dev, CL_KERNEL_WORK_GROUP_SIZE,
2740 : sizeof(size_t), &groupSize, nullptr));
2741 0 : kern = warper->kern4;
2742 0 : chSize = warper->imgChSize4;
2743 0 : chOrder = warper->imgChOrder4;
2744 : }
2745 : else
2746 : {
2747 : // We're only using the float kernel
2748 0 : handleErr(clGetKernelWorkGroupInfo(
2749 : warper->kern1, warper->dev, CL_KERNEL_WORK_GROUP_SIZE,
2750 : sizeof(size_t), &groupSize, nullptr));
2751 0 : kern = warper->kern1;
2752 0 : chSize = warper->imgChSize1;
2753 0 : chOrder = warper->imgChOrder1;
2754 : }
2755 :
2756 : // Loop over each image
2757 0 : for (i = 0; i < warper->numImages; ++i)
2758 : {
2759 : cl_mem srcImag, srcReal;
2760 : cl_mem dstReal, dstImag;
2761 0 : int bandNum = nextBandNum;
2762 :
2763 : // Switch kernels if needed
2764 0 : if (warper->useVec &&
2765 0 : nextBandNum < warper->numBands - warper->numBands % 4)
2766 : {
2767 0 : nextBandNum += 4;
2768 : }
2769 : else
2770 : {
2771 0 : if (kern == warper->kern4)
2772 : {
2773 0 : handleErr(clGetKernelWorkGroupInfo(
2774 : warper->kern1, warper->dev, CL_KERNEL_WORK_GROUP_SIZE,
2775 : sizeof(size_t), &groupSize, nullptr));
2776 0 : kern = warper->kern1;
2777 0 : chSize = warper->imgChSize1;
2778 0 : chOrder = warper->imgChOrder1;
2779 : }
2780 0 : ++nextBandNum;
2781 : }
2782 :
2783 : // Create & copy the source image
2784 0 : handleErr(err = set_src_rast_data(warper, i, chSize * wordSize, chOrder,
2785 : &srcReal, &srcImag));
2786 :
2787 : // Create & copy the output image
2788 0 : if (kern == warper->kern1)
2789 : {
2790 0 : handleErr(err = set_dst_rast_data(warper, i, wordSize, &dstReal,
2791 : &dstImag));
2792 : }
2793 : else
2794 : {
2795 0 : handleErr(err = set_dst_rast_data(warper, i, wordSize * 4, &dstReal,
2796 : &dstImag));
2797 : }
2798 :
2799 : // Set the bandNum
2800 0 : handleErr(err = clSetKernelArg(kern, 12, sizeof(int), &bandNum));
2801 :
2802 : // Run the kernel
2803 0 : handleErr(err = execute_kern(warper, kern, groupSize));
2804 :
2805 : // Free loop CL mem
2806 0 : handleErr(err = clReleaseMemObject(srcReal));
2807 0 : handleErr(err = clReleaseMemObject(srcImag));
2808 :
2809 : // Copy the back output results
2810 0 : if (kern == warper->kern1)
2811 : {
2812 0 : handleErr(
2813 : err = get_dst_rast_data(warper, i, wordSize, dstReal, dstImag));
2814 : }
2815 : else
2816 : {
2817 0 : handleErr(err = get_dst_rast_data(warper, i, wordSize * 4, dstReal,
2818 : dstImag));
2819 : }
2820 :
2821 : // Free remaining CL mem
2822 0 : handleErr(err = clReleaseMemObject(dstReal));
2823 0 : handleErr(err = clReleaseMemObject(dstImag));
2824 : }
2825 :
2826 : // Free remaining CL mem
2827 0 : handleErr(err = clReleaseMemObject(xy));
2828 0 : handleErr(err = clReleaseMemObject(unifiedSrcDensityCL));
2829 0 : handleErr(err = clReleaseMemObject(unifiedSrcValidCL));
2830 0 : handleErr(err = clReleaseMemObject(useBandSrcValidCL));
2831 0 : handleErr(err = clReleaseMemObject(nBandSrcValidCL));
2832 0 : handleErr(err = clReleaseMemObject(dstDensityCL));
2833 0 : handleErr(err = clReleaseMemObject(dstValidCL));
2834 0 : handleErr(err = clReleaseMemObject(dstNoDataRealCL));
2835 :
2836 0 : return CL_SUCCESS;
2837 : }
2838 :
2839 : /*
2840 : Sets pointers to the floating point data in the warper. The pointers
2841 : are internal to the warper structure, so don't free() them. If the imag
2842 : channel is in use, it will receive a pointer. Otherwise it'll be set to NULL.
2843 : These are pointers to floating point data, so the caller will need to
2844 : manipulate the output as appropriate before saving the data.
2845 :
2846 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
2847 : */
2848 0 : cl_int GDALWarpKernelOpenCL_getRow(struct oclWarper *warper, void **rowReal,
2849 : void **rowImag, int rowNum, int bandNum)
2850 : {
2851 0 : int memOff = rowNum * warper->dstWidth;
2852 0 : int imgNum = bandNum;
2853 :
2854 0 : if (warper->useVec && bandNum < warper->numBands - warper->numBands % 4)
2855 : {
2856 0 : memOff += warper->dstWidth * warper->dstHeight * (bandNum % 4);
2857 0 : imgNum = bandNum / 4;
2858 : }
2859 0 : else if (warper->useVec)
2860 : {
2861 0 : imgNum = bandNum / 4 + bandNum % 4;
2862 : }
2863 :
2864 : // Return pointers into the warper's data
2865 0 : switch (warper->imageFormat)
2866 : {
2867 0 : case CL_FLOAT:
2868 0 : (*rowReal) = &(warper->dstRealWork.f[imgNum][memOff]);
2869 0 : break;
2870 0 : case CL_SNORM_INT8:
2871 0 : (*rowReal) = &(warper->dstRealWork.c[imgNum][memOff]);
2872 0 : break;
2873 0 : case CL_UNORM_INT8:
2874 0 : (*rowReal) = &(warper->dstRealWork.uc[imgNum][memOff]);
2875 0 : break;
2876 0 : case CL_SNORM_INT16:
2877 0 : (*rowReal) = &(warper->dstRealWork.s[imgNum][memOff]);
2878 0 : break;
2879 0 : case CL_UNORM_INT16:
2880 0 : (*rowReal) = &(warper->dstRealWork.us[imgNum][memOff]);
2881 0 : break;
2882 : }
2883 :
2884 0 : if (warper->dstImagWorkCL == nullptr)
2885 : {
2886 0 : (*rowImag) = nullptr;
2887 : }
2888 : else
2889 : {
2890 0 : switch (warper->imageFormat)
2891 : {
2892 0 : case CL_FLOAT:
2893 0 : (*rowImag) = &(warper->dstImagWork.f[imgNum][memOff]);
2894 0 : break;
2895 0 : case CL_SNORM_INT8:
2896 0 : (*rowImag) = &(warper->dstImagWork.c[imgNum][memOff]);
2897 0 : break;
2898 0 : case CL_UNORM_INT8:
2899 0 : (*rowImag) = &(warper->dstImagWork.uc[imgNum][memOff]);
2900 0 : break;
2901 0 : case CL_SNORM_INT16:
2902 0 : (*rowImag) = &(warper->dstImagWork.s[imgNum][memOff]);
2903 0 : break;
2904 0 : case CL_UNORM_INT16:
2905 0 : (*rowImag) = &(warper->dstImagWork.us[imgNum][memOff]);
2906 0 : break;
2907 : }
2908 : }
2909 :
2910 0 : return CL_SUCCESS;
2911 : }
2912 :
2913 : /*
2914 : Free the OpenCL warper environment. It should check everything for NULL, so
2915 : be sure to mark free()ed pointers as NULL or it'll be double free()ed.
2916 :
2917 : Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
2918 : */
2919 0 : cl_int GDALWarpKernelOpenCL_deleteEnv(struct oclWarper *warper)
2920 : {
2921 : int i;
2922 0 : cl_int err = CL_SUCCESS;
2923 :
2924 0 : for (i = 0; i < warper->numImages; ++i)
2925 : {
2926 : // Run free!!
2927 0 : void *dummy = nullptr;
2928 0 : if (warper->realWork.v)
2929 0 : freeCLMem(warper->realWorkCL[i], warper->realWork.v[i]);
2930 : else
2931 0 : freeCLMem(warper->realWorkCL[i], dummy);
2932 0 : if (warper->realWork.v)
2933 0 : freeCLMem(warper->dstRealWorkCL[i], warper->dstRealWork.v[i]);
2934 : else
2935 0 : freeCLMem(warper->dstRealWorkCL[i], dummy);
2936 :
2937 : //(As applicable)
2938 0 : if (warper->imagWorkCL != nullptr && warper->imagWork.v != nullptr &&
2939 0 : warper->imagWork.v[i] != nullptr)
2940 : {
2941 0 : freeCLMem(warper->imagWorkCL[i], warper->imagWork.v[i]);
2942 : }
2943 0 : if (warper->dstImagWorkCL != nullptr &&
2944 0 : warper->dstImagWork.v != nullptr &&
2945 0 : warper->dstImagWork.v[i] != nullptr)
2946 : {
2947 0 : freeCLMem(warper->dstImagWorkCL[i], warper->dstImagWork.v[i]);
2948 : }
2949 : }
2950 :
2951 : // Free cl_mem
2952 0 : freeCLMem(warper->useBandSrcValidCL, warper->useBandSrcValid);
2953 0 : freeCLMem(warper->nBandSrcValidCL, warper->nBandSrcValid);
2954 0 : freeCLMem(warper->xyWorkCL, warper->xyWork);
2955 0 : freeCLMem(warper->fDstNoDataRealCL, warper->fDstNoDataReal);
2956 :
2957 : // Free pointers to cl_mem*
2958 0 : if (warper->realWorkCL != nullptr)
2959 0 : CPLFree(warper->realWorkCL);
2960 0 : if (warper->dstRealWorkCL != nullptr)
2961 0 : CPLFree(warper->dstRealWorkCL);
2962 :
2963 0 : if (warper->imagWorkCL != nullptr)
2964 0 : CPLFree(warper->imagWorkCL);
2965 0 : if (warper->dstImagWorkCL != nullptr)
2966 0 : CPLFree(warper->dstImagWorkCL);
2967 :
2968 0 : if (warper->realWork.v != nullptr)
2969 0 : CPLFree(warper->realWork.v);
2970 0 : if (warper->dstRealWork.v != nullptr)
2971 0 : CPLFree(warper->dstRealWork.v);
2972 :
2973 0 : if (warper->imagWork.v != nullptr)
2974 0 : CPLFree(warper->imagWork.v);
2975 0 : if (warper->dstImagWork.v != nullptr)
2976 0 : CPLFree(warper->dstImagWork.v);
2977 :
2978 : // Free OpenCL structures
2979 0 : if (warper->kern1 != nullptr)
2980 0 : clReleaseKernel(warper->kern1);
2981 0 : if (warper->kern4 != nullptr)
2982 0 : clReleaseKernel(warper->kern4);
2983 0 : if (warper->queue != nullptr)
2984 0 : clReleaseCommandQueue(warper->queue);
2985 0 : if (warper->context != nullptr)
2986 0 : clReleaseContext(warper->context);
2987 :
2988 0 : CPLFree(warper);
2989 :
2990 0 : return CL_SUCCESS;
2991 : }
2992 :
2993 : #endif /* defined(HAVE_OPENCL) */
|