LCOV - code coverage report
Current view: top level - alg - gdalwarpkernel_opencl.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 0 1045 0.0 %
Date: 2024-05-04 12:52:34 Functions: 0 24 0.0 %

          Line data    Source code
       1             : /******************************************************************************
       2             :  * $Id$
       3             :  *
       4             :  * Project:  OpenCL Image Reprojector
       5             :  * Purpose:  Implementation of the GDALWarpKernel reprojector in OpenCL.
       6             :  * Author:   Seth Price, seth@pricepages.org
       7             :  *
       8             :  ******************************************************************************
       9             :  * Copyright (c) 2010, Seth Price <seth@pricepages.org>
      10             :  * Copyright (c) 2010-2012, Even Rouault <even dot rouault at spatialys.com>
      11             :  *
      12             :  * Permission is hereby granted, free of charge, to any person obtaining a
      13             :  * copy of this software and associated documentation files (the "Software"),
      14             :  * to deal in the Software without restriction, including without limitation
      15             :  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      16             :  * and/or sell copies of the Software, and to permit persons to whom the
      17             :  * Software is furnished to do so, subject to the following conditions:
      18             :  *
      19             :  * The above copyright notice and this permission notice shall be included
      20             :  * in all copies or substantial portions of the Software.
      21             :  *
      22             :  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
      23             :  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
      24             :  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
      25             :  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
      26             :  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
      27             :  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
      28             :  * DEALINGS IN THE SOFTWARE.
      29             :  ****************************************************************************/
      30             : 
      31             : #include <algorithm>
      32             : #if defined(HAVE_OPENCL)
      33             : 
      34             : /* The following line may be uncommented for increased debugging traces and
      35             :  * profiling */
      36             : /* #define DEBUG_OPENCL 1 */
      37             : 
      38             : #include <assert.h>
      39             : #include <stdlib.h>
      40             : #include <stdio.h>
      41             : #include <limits.h>
      42             : #include <float.h>
      43             : #include <limits>
      44             : #include <vector>
      45             : #include "cpl_string.h"
      46             : #include "gdalwarpkernel_opencl.h"
      47             : 
      48             : #define handleErr(err)                                                         \
      49             :     do                                                                         \
      50             :     {                                                                          \
      51             :         if ((err) != CL_SUCCESS)                                               \
      52             :         {                                                                      \
      53             :             CPLError(CE_Failure, CPLE_AppDefined,                              \
      54             :                      "Error at file %s line %d: %s", __FILE__, __LINE__,       \
      55             :                      getCLErrorString(err));                                   \
      56             :             return err;                                                        \
      57             :         }                                                                      \
      58             :     } while (0)
      59             : 
      60             : #define handleErrRetNULL(err)                                                  \
      61             :     do                                                                         \
      62             :     {                                                                          \
      63             :         if ((err) != CL_SUCCESS)                                               \
      64             :         {                                                                      \
      65             :             (*clErr) = err;                                                    \
      66             :             CPLError(CE_Failure, CPLE_AppDefined,                              \
      67             :                      "Error at file %s line %d: %s", __FILE__, __LINE__,       \
      68             :                      getCLErrorString(err));                                   \
      69             :             return nullptr;                                                    \
      70             :         }                                                                      \
      71             :     } while (0)
      72             : 
      73             : #define handleErrGoto(err, goto_label)                                         \
      74             :     do                                                                         \
      75             :     {                                                                          \
      76             :         if ((err) != CL_SUCCESS)                                               \
      77             :         {                                                                      \
      78             :             (*clErr) = err;                                                    \
      79             :             CPLError(CE_Failure, CPLE_AppDefined,                              \
      80             :                      "Error at file %s line %d: %s", __FILE__, __LINE__,       \
      81             :                      getCLErrorString(err));                                   \
      82             :             goto goto_label;                                                   \
      83             :         }                                                                      \
      84             :     } while (0)
      85             : 
      86             : #define freeCLMem(clMem, fallBackMem)                                          \
      87             :     do                                                                         \
      88             :     {                                                                          \
      89             :         if ((clMem) != nullptr)                                                \
      90             :         {                                                                      \
      91             :             handleErr(err = clReleaseMemObject(clMem));                        \
      92             :             clMem = nullptr;                                                   \
      93             :             fallBackMem = nullptr;                                             \
      94             :         }                                                                      \
      95             :         else if ((fallBackMem) != nullptr)                                     \
      96             :         {                                                                      \
      97             :             CPLFree(fallBackMem);                                              \
      98             :             fallBackMem = nullptr;                                             \
      99             :         }                                                                      \
     100             :     } while (false)
     101             : 
     102           0 : static const char *getCLErrorString(cl_int err)
     103             : {
     104           0 :     switch (err)
     105             :     {
     106           0 :         case CL_SUCCESS:
     107           0 :             return ("CL_SUCCESS");
     108             :             break;
     109           0 :         case CL_DEVICE_NOT_FOUND:
     110           0 :             return ("CL_DEVICE_NOT_FOUND");
     111             :             break;
     112           0 :         case CL_DEVICE_NOT_AVAILABLE:
     113           0 :             return ("CL_DEVICE_NOT_AVAILABLE");
     114             :             break;
     115           0 :         case CL_COMPILER_NOT_AVAILABLE:
     116           0 :             return ("CL_COMPILER_NOT_AVAILABLE");
     117             :             break;
     118           0 :         case CL_MEM_OBJECT_ALLOCATION_FAILURE:
     119           0 :             return ("CL_MEM_OBJECT_ALLOCATION_FAILURE");
     120             :             break;
     121           0 :         case CL_OUT_OF_RESOURCES:
     122           0 :             return ("CL_OUT_OF_RESOURCES");
     123             :             break;
     124           0 :         case CL_OUT_OF_HOST_MEMORY:
     125           0 :             return ("CL_OUT_OF_HOST_MEMORY");
     126             :             break;
     127           0 :         case CL_PROFILING_INFO_NOT_AVAILABLE:
     128           0 :             return ("CL_PROFILING_INFO_NOT_AVAILABLE");
     129             :             break;
     130           0 :         case CL_MEM_COPY_OVERLAP:
     131           0 :             return ("CL_MEM_COPY_OVERLAP");
     132             :             break;
     133           0 :         case CL_IMAGE_FORMAT_MISMATCH:
     134           0 :             return ("CL_IMAGE_FORMAT_MISMATCH");
     135             :             break;
     136           0 :         case CL_IMAGE_FORMAT_NOT_SUPPORTED:
     137           0 :             return ("CL_IMAGE_FORMAT_NOT_SUPPORTED");
     138             :             break;
     139           0 :         case CL_BUILD_PROGRAM_FAILURE:
     140           0 :             return ("CL_BUILD_PROGRAM_FAILURE");
     141             :             break;
     142           0 :         case CL_MAP_FAILURE:
     143           0 :             return ("CL_MAP_FAILURE");
     144             :             break;
     145           0 :         case CL_INVALID_VALUE:
     146           0 :             return ("CL_INVALID_VALUE");
     147             :             break;
     148           0 :         case CL_INVALID_DEVICE_TYPE:
     149           0 :             return ("CL_INVALID_DEVICE_TYPE");
     150             :             break;
     151           0 :         case CL_INVALID_PLATFORM:
     152           0 :             return ("CL_INVALID_PLATFORM");
     153             :             break;
     154           0 :         case CL_INVALID_DEVICE:
     155           0 :             return ("CL_INVALID_DEVICE");
     156             :             break;
     157           0 :         case CL_INVALID_CONTEXT:
     158           0 :             return ("CL_INVALID_CONTEXT");
     159             :             break;
     160           0 :         case CL_INVALID_QUEUE_PROPERTIES:
     161           0 :             return ("CL_INVALID_QUEUE_PROPERTIES");
     162             :             break;
     163           0 :         case CL_INVALID_COMMAND_QUEUE:
     164           0 :             return ("CL_INVALID_COMMAND_QUEUE");
     165             :             break;
     166           0 :         case CL_INVALID_HOST_PTR:
     167           0 :             return ("CL_INVALID_HOST_PTR");
     168             :             break;
     169           0 :         case CL_INVALID_MEM_OBJECT:
     170           0 :             return ("CL_INVALID_MEM_OBJECT");
     171             :             break;
     172           0 :         case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
     173           0 :             return ("CL_INVALID_IMAGE_FORMAT_DESCRIPTOR");
     174             :             break;
     175           0 :         case CL_INVALID_IMAGE_SIZE:
     176           0 :             return ("CL_INVALID_IMAGE_SIZE");
     177             :             break;
     178           0 :         case CL_INVALID_SAMPLER:
     179           0 :             return ("CL_INVALID_SAMPLER");
     180             :             break;
     181           0 :         case CL_INVALID_BINARY:
     182           0 :             return ("CL_INVALID_BINARY");
     183             :             break;
     184           0 :         case CL_INVALID_BUILD_OPTIONS:
     185           0 :             return ("CL_INVALID_BUILD_OPTIONS");
     186             :             break;
     187           0 :         case CL_INVALID_PROGRAM:
     188           0 :             return ("CL_INVALID_PROGRAM");
     189             :             break;
     190           0 :         case CL_INVALID_PROGRAM_EXECUTABLE:
     191           0 :             return ("CL_INVALID_PROGRAM_EXECUTABLE");
     192             :             break;
     193           0 :         case CL_INVALID_KERNEL_NAME:
     194           0 :             return ("CL_INVALID_KERNEL_NAME");
     195             :             break;
     196           0 :         case CL_INVALID_KERNEL_DEFINITION:
     197           0 :             return ("CL_INVALID_KERNEL_DEFINITION");
     198             :             break;
     199           0 :         case CL_INVALID_KERNEL:
     200           0 :             return ("CL_INVALID_KERNEL");
     201             :             break;
     202           0 :         case CL_INVALID_ARG_INDEX:
     203           0 :             return ("CL_INVALID_ARG_INDEX");
     204             :             break;
     205           0 :         case CL_INVALID_ARG_VALUE:
     206           0 :             return ("CL_INVALID_ARG_VALUE");
     207             :             break;
     208           0 :         case CL_INVALID_ARG_SIZE:
     209           0 :             return ("CL_INVALID_ARG_SIZE");
     210             :             break;
     211           0 :         case CL_INVALID_KERNEL_ARGS:
     212           0 :             return ("CL_INVALID_KERNEL_ARGS");
     213             :             break;
     214           0 :         case CL_INVALID_WORK_DIMENSION:
     215           0 :             return ("CL_INVALID_WORK_DIMENSION");
     216             :             break;
     217           0 :         case CL_INVALID_WORK_GROUP_SIZE:
     218           0 :             return ("CL_INVALID_WORK_GROUP_SIZE");
     219             :             break;
     220           0 :         case CL_INVALID_WORK_ITEM_SIZE:
     221           0 :             return ("CL_INVALID_WORK_ITEM_SIZE");
     222             :             break;
     223           0 :         case CL_INVALID_GLOBAL_OFFSET:
     224           0 :             return ("CL_INVALID_GLOBAL_OFFSET");
     225             :             break;
     226           0 :         case CL_INVALID_EVENT_WAIT_LIST:
     227           0 :             return ("CL_INVALID_EVENT_WAIT_LIST");
     228             :             break;
     229           0 :         case CL_INVALID_EVENT:
     230           0 :             return ("CL_INVALID_EVENT");
     231             :             break;
     232           0 :         case CL_INVALID_OPERATION:
     233           0 :             return ("CL_INVALID_OPERATION");
     234             :             break;
     235           0 :         case CL_INVALID_GL_OBJECT:
     236           0 :             return ("CL_INVALID_GL_OBJECT");
     237             :             break;
     238           0 :         case CL_INVALID_BUFFER_SIZE:
     239           0 :             return ("CL_INVALID_BUFFER_SIZE");
     240             :             break;
     241           0 :         case CL_INVALID_MIP_LEVEL:
     242           0 :             return ("CL_INVALID_MIP_LEVEL");
     243             :             break;
     244           0 :         case CL_INVALID_GLOBAL_WORK_SIZE:
     245           0 :             return ("CL_INVALID_GLOBAL_WORK_SIZE");
     246             :             break;
     247             :     }
     248             : 
     249           0 :     return "unknown_error";
     250             : }
     251             : 
     252           0 : static const char *getCLDataTypeString(cl_channel_type dataType)
     253             : {
     254           0 :     switch (dataType)
     255             :     {
     256           0 :         case CL_SNORM_INT8:
     257           0 :             return "CL_SNORM_INT8";
     258           0 :         case CL_SNORM_INT16:
     259           0 :             return "CL_SNORM_INT16";
     260           0 :         case CL_UNORM_INT8:
     261           0 :             return "CL_UNORM_INT8";
     262           0 :         case CL_UNORM_INT16:
     263           0 :             return "CL_UNORM_INT16";
     264             : #if 0
     265             :         case CL_UNORM_SHORT_565: return "CL_UNORM_SHORT_565";
     266             :         case CL_UNORM_SHORT_555: return "CL_UNORM_SHORT_555";
     267             :         case CL_UNORM_INT_101010: return "CL_UNORM_INT_101010";
     268             :         case CL_SIGNED_INT8: return "CL_SIGNED_INT8";
     269             :         case CL_SIGNED_INT16: return "CL_SIGNED_INT16";
     270             :         case CL_SIGNED_INT32: return "CL_SIGNED_INT32";
     271             :         case CL_UNSIGNED_INT8: return "CL_UNSIGNED_INT8";
     272             :         case CL_UNSIGNED_INT16: return "CL_UNSIGNED_INT16";
     273             :         case CL_UNSIGNED_INT32: return "CL_UNSIGNED_INT32";
     274             :         case CL_HALF_FLOAT: return "CL_HALF_FLOAT";
     275             : #endif
     276           0 :         case CL_FLOAT:
     277           0 :             return "CL_FLOAT";
     278           0 :         default:
     279           0 :             return "unknown";
     280             :     }
     281             : }
     282             : 
     283             : /*
     284             :  Finds an appropriate OpenCL device. For debugging, it's
     285             :  always easier to use CL_DEVICE_TYPE_CPU because then */
     286             : /*ok*/ /*printf() can be called
     287             : from the kernel. If debugging is on, we can print the name and stats about the
     288             : device we're using.
     289             : */
     290           0 : static cl_device_id get_device(OCLVendor *peVendor)
     291             : {
     292           0 :     cl_int err = 0;
     293           0 :     size_t returned_size = 0;
     294           0 :     cl_char vendor_name[1024] = {0};
     295           0 :     cl_char device_name[1024] = {0};
     296             :     cl_device_type device_type;
     297             : 
     298           0 :     std::vector<cl_platform_id> platforms;
     299           0 :     cl_uint num_platforms = 0;
     300             : 
     301             :     static bool gbBuggyOpenCL = false;
     302           0 :     if (gbBuggyOpenCL)
     303           0 :         return nullptr;
     304             :     try
     305             :     {
     306           0 :         err = clGetPlatformIDs(0, nullptr, &num_platforms);
     307           0 :         if (err != CL_SUCCESS || num_platforms == 0)
     308           0 :             return nullptr;
     309             : 
     310           0 :         platforms.resize(num_platforms);
     311           0 :         err = clGetPlatformIDs(num_platforms, &platforms[0], nullptr);
     312           0 :         if (err != CL_SUCCESS)
     313           0 :             return nullptr;
     314             :     }
     315           0 :     catch (...)
     316             :     {
     317           0 :         gbBuggyOpenCL = true;
     318           0 :         CPLDebug("OpenCL", "clGetPlatformIDs() threw a C++ exception");
     319             :         // This should normally not happen. But that does happen with
     320             :         // intel-opencl 0r2.0-54426 when run under xvfb-run
     321           0 :         return nullptr;
     322             :     }
     323             : 
     324             :     const bool bUseOpenCLCPU =
     325           0 :         CPLTestBool(CPLGetConfigOption("OPENCL_USE_CPU", "FALSE"));
     326             : 
     327             :     struct device_info
     328             :     {
     329             :         cl_device_id id;
     330             :         cl_device_type device_type;
     331             :         std::string vendor_name;
     332             :         std::string device_name;
     333             :     };
     334             : 
     335           0 :     std::vector<device_info> openCLdevices;
     336             : 
     337             :     // In case we have several implementations, pick up the non Intel one by
     338             :     // default, unless the PREFERRED_OPENCL_VENDOR config option is specified.
     339           0 :     if (num_platforms > 0)
     340             :     {
     341             :         const char *pszBlacklistedVendor =
     342           0 :             CPLGetConfigOption("BLACKLISTED_OPENCL_VENDOR", nullptr);
     343             : 
     344           0 :         for (cl_uint platformIndex = 0; platformIndex < num_platforms;
     345             :              platformIndex++)
     346             :         {
     347             : 
     348           0 :             constexpr int max_devices{20};
     349             :             cl_uint num_devices;
     350             :             cl_device_id devices[max_devices];
     351           0 :             err = clGetDeviceIDs(platforms[platformIndex],
     352             :                                  bUseOpenCLCPU
     353             :                                      ? CL_DEVICE_TYPE_CPU
     354             :                                      : CL_DEVICE_TYPE_CPU | CL_DEVICE_TYPE_GPU,
     355             :                                  max_devices, devices, &num_devices);
     356             : 
     357           0 :             if (err != CL_SUCCESS)
     358             :             {
     359           0 :                 continue;
     360             :             }
     361             : 
     362           0 :             cl_device_id device = nullptr;
     363             : 
     364           0 :             for (cl_uint deviceIndex = 0; deviceIndex < num_devices;
     365             :                  ++deviceIndex)
     366             :             {
     367             : 
     368           0 :                 device = devices[deviceIndex];
     369           0 :                 err = clGetDeviceInfo(device, CL_DEVICE_VENDOR,
     370             :                                       sizeof(vendor_name), vendor_name,
     371             :                                       &returned_size);
     372           0 :                 err |=
     373           0 :                     clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name),
     374             :                                     device_name, &returned_size);
     375           0 :                 err |=
     376           0 :                     clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(device_type),
     377             :                                     &device_type, &returned_size);
     378           0 :                 assert(err == CL_SUCCESS);
     379           0 :                 if (pszBlacklistedVendor &&
     380           0 :                     EQUAL(reinterpret_cast<const char *>(vendor_name),
     381             :                           pszBlacklistedVendor))
     382             :                 {
     383           0 :                     CPLDebug("OpenCL",
     384             :                              "Blacklisted vendor='%s' / device='%s' "
     385             :                              "implementation skipped",
     386             :                              vendor_name, device_name);
     387             :                 }
     388             :                 else
     389             :                 {
     390           0 :                     CPLDebug(
     391             :                         "OpenCL",
     392             :                         "Found vendor='%s' / device='%s' (%s implementation).",
     393             :                         vendor_name, device_name,
     394           0 :                         (device_type == CL_DEVICE_TYPE_GPU) ? "GPU" : "CPU");
     395           0 :                     openCLdevices.push_back(
     396             :                         {device, device_type,
     397             :                          reinterpret_cast<const char *>(vendor_name),
     398             :                          reinterpret_cast<const char *>(device_name)});
     399             :                 }
     400             :             }
     401             :         }
     402             :     }
     403             : 
     404           0 :     if (!openCLdevices.empty())
     405             :     {
     406             :         // Sort, GPU first, then preferred vendor, Intel comes last (unless is the preferred of course)
     407             :         const std::string preferredVendorName{
     408           0 :             CPLGetConfigOption("PREFERRED_OPENCL_VENDOR", "")};
     409           0 :         std::sort(
     410             :             openCLdevices.begin(), openCLdevices.end(),
     411           0 :             [&](const device_info first, const device_info second) -> int
     412             :             {
     413           0 :                 return (first.device_type == CL_DEVICE_TYPE_GPU ? 4 : 0) +
     414           0 :                            (first.vendor_name == preferredVendorName ? 2 : 0) +
     415           0 :                            (first.vendor_name.find("Intel") == 0 ? 0 : 1) >
     416           0 :                        (second.device_type == CL_DEVICE_TYPE_GPU ? 4 : 0) +
     417           0 :                            (second.vendor_name == preferredVendorName ? 2 : 0) +
     418           0 :                            (second.vendor_name.find("Intel") == 0 ? 0 : 1);
     419             :             });
     420             :     }
     421             :     else
     422             :     {
     423           0 :         CPLDebug("OpenCL", "No implementation found");
     424           0 :         return nullptr;
     425             :     }
     426             : 
     427           0 :     const device_info &device{openCLdevices.front()};
     428             : 
     429           0 :     CPLDebug("OpenCL",
     430             :              "Connected to vendor='%s' / device='%s' (%s implementation).",
     431             :              device.vendor_name.c_str(), device.device_name.c_str(),
     432           0 :              (device.device_type == CL_DEVICE_TYPE_GPU) ? "GPU" : "CPU");
     433             : 
     434           0 :     if (STARTS_WITH(device.vendor_name.c_str(), "Advanced Micro Devices"))
     435           0 :         *peVendor = VENDOR_AMD;
     436           0 :     else if (STARTS_WITH(device.vendor_name.c_str(), "Intel"))
     437           0 :         *peVendor = VENDOR_INTEL;
     438             :     else
     439           0 :         *peVendor = VENDOR_OTHER;
     440             : 
     441           0 :     return device.id;
     442             : }
     443             : 
     444             : /*
     445             :  Given that not all OpenCL devices support the same image formats, we need to
     446             :  make do with what we have. This leads to wasted space, but as OpenCL matures
     447             :  I hope it'll get better.
     448             :  */
     449           0 : static cl_int set_supported_formats(struct oclWarper *warper,
     450             :                                     cl_channel_order minOrderSize,
     451             :                                     cl_channel_order *chosenOrder,
     452             :                                     unsigned int *chosenSize,
     453             :                                     cl_channel_type dataType)
     454             : {
     455             :     cl_image_format *fmtBuf =
     456           0 :         static_cast<cl_image_format *>(calloc(256, sizeof(cl_image_format)));
     457             :     cl_uint numRet;
     458             :     cl_uint i;
     459           0 :     cl_uint extraSpace = 9999;
     460             :     cl_int err;
     461           0 :     int bFound = FALSE;
     462             : 
     463             :     // Find what we *can* handle
     464           0 :     handleErr(err = clGetSupportedImageFormats(
     465             :                   warper->context, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE2D, 256,
     466             :                   fmtBuf, &numRet));
     467           0 :     for (i = 0; i < numRet; ++i)
     468             :     {
     469           0 :         cl_channel_order thisOrderSize = 0;
     470           0 :         switch (fmtBuf[i].image_channel_order)
     471             :         {
     472             :                 // Only support formats which use the channels in order
     473             :                 // (x,y,z,w)
     474           0 :             case CL_R:
     475             :             case CL_INTENSITY:
     476             :             case CL_LUMINANCE:
     477           0 :                 thisOrderSize = 1;
     478           0 :                 break;
     479           0 :             case CL_RG:
     480           0 :                 thisOrderSize = 2;
     481           0 :                 break;
     482           0 :             case CL_RGB:
     483           0 :                 thisOrderSize = 3;
     484           0 :                 break;
     485           0 :             case CL_RGBA:
     486           0 :                 thisOrderSize = 4;
     487           0 :                 break;
     488             :         }
     489             : 
     490             :         // Choose an order with the least wasted space
     491           0 :         if (fmtBuf[i].image_channel_data_type == dataType &&
     492           0 :             minOrderSize <= thisOrderSize &&
     493           0 :             extraSpace > thisOrderSize - minOrderSize)
     494             :         {
     495             : 
     496             :             // Set the vector size, order, & remember wasted space
     497           0 :             (*chosenSize) = thisOrderSize;
     498           0 :             (*chosenOrder) = fmtBuf[i].image_channel_order;
     499           0 :             extraSpace = thisOrderSize - minOrderSize;
     500           0 :             bFound = TRUE;
     501             :         }
     502             :     }
     503             : 
     504           0 :     free(fmtBuf);
     505             : 
     506           0 :     if (!bFound)
     507             :     {
     508           0 :         CPLDebug("OpenCL",
     509             :                  "Cannot find supported format for dataType = %s and "
     510             :                  "minOrderSize = %d",
     511             :                  getCLDataTypeString(dataType), static_cast<int>(minOrderSize));
     512             :     }
     513           0 :     return (bFound) ? CL_SUCCESS : CL_INVALID_OPERATION;
     514             : }
     515             : 
     516             : /*
     517             :  Allocate some pinned memory that we can use as an intermediate buffer. We're
     518             :  using the pinned memory to assemble the data before transferring it to the
     519             :  device. The reason we're using pinned RAM is because the transfer speed from
     520             :  host RAM to device RAM is faster than non-pinned. The disadvantage is that
     521             :  pinned RAM is a scarce OS resource. I'm making the assumption that the user
     522             :  has as much pinned host RAM available as total device RAM because device RAM
     523             :  tends to be similarly scarce. However, if the pinned memory fails we fall back
     524             :  to using a regular memory allocation.
     525             : 
     526             :  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
     527             :  */
     528           0 : static cl_int alloc_pinned_mem(struct oclWarper *warper, int imgNum,
     529             :                                size_t dataSz, void **wrkPtr, cl_mem *wrkCL)
     530             : {
     531           0 :     cl_int err = CL_SUCCESS;
     532           0 :     wrkCL[imgNum] = clCreateBuffer(warper->context,
     533             :                                    CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
     534             :                                    dataSz, nullptr, &err);
     535             : 
     536           0 :     if (err == CL_SUCCESS)
     537             :     {
     538           0 :         wrkPtr[imgNum] = clEnqueueMapBuffer(warper->queue, wrkCL[imgNum],
     539             :                                             CL_FALSE, CL_MAP_WRITE, 0, dataSz,
     540             :                                             0, nullptr, nullptr, &err);
     541           0 :         handleErr(err);
     542             :     }
     543             :     else
     544             :     {
     545           0 :         wrkCL[imgNum] = nullptr;
     546             : #ifdef DEBUG_OPENCL
     547             :         CPLDebug("OpenCL", "Using fallback non-pinned memory!");
     548             : #endif
     549             :         // Fallback to regular allocation
     550           0 :         wrkPtr[imgNum] = VSI_MALLOC_VERBOSE(dataSz);
     551             : 
     552           0 :         if (wrkPtr[imgNum] == nullptr)
     553           0 :             handleErr(err = CL_OUT_OF_HOST_MEMORY);
     554             :     }
     555             : 
     556           0 :     return CL_SUCCESS;
     557             : }
     558             : 
     559             : /*
     560             :  Allocates the working host memory for all bands of the image in the warper
     561             :  structure. This includes both the source image buffers and the destination
     562             :  buffers. This memory is located on the host, so we can assemble the image.
     563             :  Reasons for buffering it like this include reading each row from disk and
     564             :  de-interleaving bands and parts of bands. Then they can be copied to the device
     565             :  as a single operation fit for use as an OpenCL memory object.
     566             : 
     567             :  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
     568             :  */
     569           0 : static cl_int alloc_working_arr(struct oclWarper *warper, size_t ptrSz,
     570             :                                 size_t dataSz, CPL_UNUSED size_t *fmtSz)
     571             : {
     572           0 :     cl_int err = CL_SUCCESS;
     573             :     int i, b;
     574             :     size_t srcDataSz1, dstDataSz1, srcDataSz4, dstDataSz4;
     575           0 :     const int numBands = warper->numBands;
     576             : 
     577             :     // Find the best channel order for this format
     578           0 :     err = set_supported_formats(warper, 1, &(warper->imgChOrder1),
     579             :                                 &(warper->imgChSize1), warper->imageFormat);
     580           0 :     handleErr(err);
     581           0 :     if (warper->useVec)
     582             :     {
     583           0 :         err = set_supported_formats(warper, 4, &(warper->imgChOrder4),
     584             :                                     &(warper->imgChSize4), warper->imageFormat);
     585           0 :         handleErr(err);
     586             :     }
     587             : 
     588             :     // Alloc space for pointers to the main image data
     589           0 :     warper->realWork.v =
     590           0 :         static_cast<void **>(VSI_CALLOC_VERBOSE(ptrSz, warper->numImages));
     591           0 :     warper->dstRealWork.v =
     592           0 :         static_cast<void **>(VSI_CALLOC_VERBOSE(ptrSz, warper->numImages));
     593           0 :     if (warper->realWork.v == nullptr || warper->dstRealWork.v == nullptr)
     594           0 :         handleErr(err = CL_OUT_OF_HOST_MEMORY);
     595             : 
     596           0 :     if (warper->imagWorkCL != nullptr)
     597             :     {
     598             :         // Alloc space for pointers to the extra channel, if it exists
     599           0 :         warper->imagWork.v =
     600           0 :             static_cast<void **>(VSI_CALLOC_VERBOSE(ptrSz, warper->numImages));
     601           0 :         warper->dstImagWork.v =
     602           0 :             static_cast<void **>(VSI_CALLOC_VERBOSE(ptrSz, warper->numImages));
     603           0 :         if (warper->imagWork.v == nullptr || warper->dstImagWork.v == nullptr)
     604           0 :             handleErr(err = CL_OUT_OF_HOST_MEMORY);
     605             :     }
     606             :     else
     607             :     {
     608           0 :         warper->imagWork.v = nullptr;
     609           0 :         warper->dstImagWork.v = nullptr;
     610             :     }
     611             : 
     612             :     // Calc the sizes we need
     613           0 :     srcDataSz1 =
     614           0 :         dataSz * warper->srcWidth * warper->srcHeight * warper->imgChSize1;
     615           0 :     dstDataSz1 = dataSz * warper->dstWidth * warper->dstHeight;
     616           0 :     srcDataSz4 =
     617           0 :         dataSz * warper->srcWidth * warper->srcHeight * warper->imgChSize4;
     618           0 :     dstDataSz4 = dataSz * warper->dstWidth * warper->dstHeight * 4;
     619             : 
     620             :     // Allocate pinned memory for each band's image
     621           0 :     for (b = 0, i = 0; b < numBands && i < warper->numImages; ++i)
     622             :     {
     623           0 :         if (warper->useVec && b < numBands - numBands % 4)
     624             :         {
     625           0 :             handleErr(err = alloc_pinned_mem(warper, i, srcDataSz4,
     626             :                                              warper->realWork.v,
     627             :                                              warper->realWorkCL));
     628             : 
     629           0 :             handleErr(err = alloc_pinned_mem(warper, i, dstDataSz4,
     630             :                                              warper->dstRealWork.v,
     631             :                                              warper->dstRealWorkCL));
     632           0 :             b += 4;
     633             :         }
     634             :         else
     635             :         {
     636           0 :             handleErr(err = alloc_pinned_mem(warper, i, srcDataSz1,
     637             :                                              warper->realWork.v,
     638             :                                              warper->realWorkCL));
     639             : 
     640           0 :             handleErr(err = alloc_pinned_mem(warper, i, dstDataSz1,
     641             :                                              warper->dstRealWork.v,
     642             :                                              warper->dstRealWorkCL));
     643           0 :             ++b;
     644             :         }
     645             :     }
     646             : 
     647           0 :     if (warper->imagWorkCL != nullptr)
     648             :     {
     649             :         // Allocate pinned memory for each band's extra channel, if exists
     650           0 :         for (b = 0, i = 0; b < numBands && i < warper->numImages; ++i)
     651             :         {
     652           0 :             if (warper->useVec && b < numBands - numBands % 4)
     653             :             {
     654           0 :                 handleErr(err = alloc_pinned_mem(warper, i, srcDataSz4,
     655             :                                                  warper->imagWork.v,
     656             :                                                  warper->imagWorkCL));
     657             : 
     658           0 :                 handleErr(err = alloc_pinned_mem(warper, i, dstDataSz4,
     659             :                                                  warper->dstImagWork.v,
     660             :                                                  warper->dstImagWorkCL));
     661           0 :                 b += 4;
     662             :             }
     663             :             else
     664             :             {
     665           0 :                 handleErr(err = alloc_pinned_mem(warper, i, srcDataSz1,
     666             :                                                  warper->imagWork.v,
     667             :                                                  warper->imagWorkCL));
     668             : 
     669           0 :                 handleErr(err = alloc_pinned_mem(warper, i, dstDataSz1,
     670             :                                                  warper->dstImagWork.v,
     671             :                                                  warper->dstImagWorkCL));
     672           0 :                 ++b;
     673             :             }
     674             :         }
     675             :     }
     676             : 
     677           0 :     return CL_SUCCESS;
     678             : }
     679             : 
     680             : /*
     681             :  Assemble and create the kernel. For optimization, portability, and
     682             :  implementation limitation reasons, the program is actually assembled from
     683             :  several strings, then compiled with as many invariants as possible defined by
     684             :  the preprocessor. There is also quite a bit of error-catching code in here
     685             :  because the kernel is where many bugs show up.
     686             : 
     687             :  Returns CL_SUCCESS on success and other CL_* errors in the error buffer when
     688             :  something goes wrong.
     689             :  */
     690           0 : static cl_kernel get_kernel(struct oclWarper *warper, char useVec,
     691             :                             double dfXScale, double dfYScale, double dfXFilter,
     692             :                             double dfYFilter, int nXRadius, int nYRadius,
     693             :                             int nFiltInitX, int nFiltInitY, cl_int *clErr)
     694             : {
     695             :     cl_program program;
     696             :     cl_kernel kernel;
     697           0 :     cl_int err = CL_SUCCESS;
     698           0 :     constexpr int PROGBUF_SIZE = 128000;
     699           0 :     std::string buffer;
     700           0 :     buffer.resize(PROGBUF_SIZE);
     701           0 :     std::string progBuf;
     702           0 :     progBuf.resize(PROGBUF_SIZE);
     703           0 :     float dstMinVal = 0.f, dstMaxVal = 0.0;
     704             : 
     705           0 :     const char *outType = "";
     706           0 :     const char *dUseVec = "";
     707           0 :     const char *dVecf = "float";
     708           0 :     const char *kernGenFuncs = R""""(
     709             : // ********************* General Funcs ********************
     710             : void clampToDst(float fReal,
     711             :                 __global outType *dstPtr,
     712             :                 unsigned int iDstOffset,
     713             :                 __constant float *fDstNoDataReal,
     714             :                 int bandNum);
     715             : void setPixel(__global outType *dstReal,
     716             :                 __global outType *dstImag,
     717             :                 __global float *dstDensity,
     718             :                 __global int *nDstValid,
     719             :                 __constant float *fDstNoDataReal,
     720             :                 const int bandNum,
     721             :                 vecf fDensity, vecf fReal, vecf fImag);
     722             : int getPixel(__read_only image2d_t srcReal,
     723             :                 __read_only image2d_t srcImag,
     724             :                 __global float *fUnifiedSrcDensity,
     725             :                 __global int *nUnifiedSrcValid,
     726             :                 __constant char *useBandSrcValid,
     727             :                 __global int *nBandSrcValid,
     728             :                 const int2 iSrc,
     729             :                 int bandNum,
     730             :                 vecf *fDensity, vecf *fReal, vecf *fImag);
     731             : int isValid(__global float *fUnifiedSrcDensity,
     732             :                 __global int *nUnifiedSrcValid,
     733             :                 float2 fSrcCoords );
     734             : float2 getSrcCoords(__read_only image2d_t srcCoords);
     735             : 
     736             : #ifdef USE_CLAMP_TO_DST_FLOAT
     737             : void clampToDst(float fReal,
     738             :                 __global outType *dstPtr,
     739             :                 unsigned int iDstOffset,
     740             :                 __constant float *fDstNoDataReal,
     741             :                 int bandNum)
     742             : {
     743             :     dstPtr[iDstOffset] = fReal;
     744             : }
     745             : #else
     746             : void clampToDst(float fReal,
     747             :                 __global outType *dstPtr,
     748             :                 unsigned int iDstOffset,
     749             :                 __constant float *fDstNoDataReal,
     750             :                 int bandNum)
     751             : {
     752             :     fReal *= dstMaxVal;
     753             : 
     754             :     if (fReal < dstMinVal)
     755             :         dstPtr[iDstOffset] = (outType)dstMinVal;
     756             :     else if (fReal > dstMaxVal)
     757             :         dstPtr[iDstOffset] = (outType)dstMaxVal;
     758             :     else
     759             :         dstPtr[iDstOffset] = (dstMinVal < 0) ? (outType)floor(fReal + 0.5f) : (outType)(fReal + 0.5f);
     760             : 
     761             :     if (useDstNoDataReal && bandNum >= 0 &&
     762             :         fDstNoDataReal[bandNum] == dstPtr[iDstOffset])
     763             :     {
     764             :         if (dstPtr[iDstOffset] == dstMinVal)
     765             :             dstPtr[iDstOffset] = dstMinVal + 1;
     766             :         else
     767             :             dstPtr[iDstOffset] = dstPtr[iDstOffset] - 1;
     768             :     }
     769             : }
     770             : #endif
     771             : 
     772             : void setPixel(__global outType *dstReal,
     773             :               __global outType *dstImag,
     774             :               __global float *dstDensity,
     775             :               __global int *nDstValid,
     776             :               __constant float *fDstNoDataReal,
     777             :               const int bandNum,
     778             :               vecf fDensity, vecf fReal, vecf fImag)
     779             : {
     780             :     unsigned int iDstOffset = get_global_id(1)*iDstWidth + get_global_id(0);
     781             : 
     782             : #ifdef USE_VEC
     783             :     if (fDensity.x < 0.00001f && fDensity.y < 0.00001f &&
     784             :         fDensity.z < 0.00001f && fDensity.w < 0.00001f ) {
     785             : 
     786             :         fReal = 0.0f;
     787             :         fImag = 0.0f;
     788             : 
     789             :     } else if (fDensity.x < 0.9999f || fDensity.y < 0.9999f ||
     790             :                fDensity.z < 0.9999f || fDensity.w < 0.9999f ) {
     791             :         vecf fDstReal, fDstImag;
     792             :         float fDstDensity;
     793             : 
     794             :         fDstReal.x = dstReal[iDstOffset];
     795             :         fDstReal.y = dstReal[iDstOffset+iDstHeight*iDstWidth];
     796             :         fDstReal.z = dstReal[iDstOffset+iDstHeight*iDstWidth*2];
     797             :         fDstReal.w = dstReal[iDstOffset+iDstHeight*iDstWidth*3];
     798             :         if (useImag) {
     799             :             fDstImag.x = dstImag[iDstOffset];
     800             :             fDstImag.y = dstImag[iDstOffset+iDstHeight*iDstWidth];
     801             :             fDstImag.z = dstImag[iDstOffset+iDstHeight*iDstWidth*2];
     802             :             fDstImag.w = dstImag[iDstOffset+iDstHeight*iDstWidth*3];
     803             :         }
     804             : #else
     805             :     if (fDensity < 0.00001f) {
     806             : 
     807             :         fReal = 0.0f;
     808             :         fImag = 0.0f;
     809             : 
     810             :     } else if (fDensity < 0.9999f) {
     811             :         vecf fDstReal, fDstImag;
     812             :         float fDstDensity;
     813             : 
     814             :         fDstReal = dstReal[iDstOffset];
     815             :         if (useImag)
     816             :             fDstImag = dstImag[iDstOffset];
     817             : #endif
     818             : 
     819             :         if (useDstDensity)
     820             :             fDstDensity = dstDensity[iDstOffset];
     821             :         else if (useDstValid &&
     822             :                  !((nDstValid[iDstOffset>>5] & (0x01 << (iDstOffset & 0x1f))) ))
     823             :             fDstDensity = 0.0f;
     824             :         else
     825             :             fDstDensity = 1.0f;
     826             : 
     827             :         vecf fDstInfluence = (1.0f - fDensity) * fDstDensity;
     828             : 
     829             :         // Density should be checked for <= 0.0 & handled by the calling function
     830             :         fReal = (fReal * fDensity + fDstReal * fDstInfluence) / (fDensity + fDstInfluence);
     831             :         if (useImag)
     832             :             fImag = (fImag * fDensity + fDstImag * fDstInfluence) / (fDensity + fDstInfluence);
     833             :     }
     834             : 
     835             : #ifdef USE_VEC
     836             :     clampToDst(fReal.x, dstReal, iDstOffset, fDstNoDataReal, bandNum);
     837             :     clampToDst(fReal.y, dstReal, iDstOffset+iDstHeight*iDstWidth, fDstNoDataReal, bandNum);
     838             :     clampToDst(fReal.z, dstReal, iDstOffset+iDstHeight*iDstWidth*2, fDstNoDataReal, bandNum);
     839             :     clampToDst(fReal.w, dstReal, iDstOffset+iDstHeight*iDstWidth*3, fDstNoDataReal, bandNum);
     840             :     if (useImag) {
     841             :         clampToDst(fImag.x, dstImag, iDstOffset, fDstNoDataReal, -1);
     842             :         clampToDst(fImag.y, dstImag, iDstOffset+iDstHeight*iDstWidth, fDstNoDataReal, -1);
     843             :         clampToDst(fImag.z, dstImag, iDstOffset+iDstHeight*iDstWidth*2, fDstNoDataReal, -1);
     844             :         clampToDst(fImag.w, dstImag, iDstOffset+iDstHeight*iDstWidth*3, fDstNoDataReal, -1);
     845             :     }
     846             : #else
     847             :     clampToDst(fReal, dstReal, iDstOffset, fDstNoDataReal, bandNum);
     848             :     if (useImag)
     849             :         clampToDst(fImag, dstImag, iDstOffset, fDstNoDataReal, -1);
     850             : #endif
     851             : }
     852             : 
     853             : int getPixel(__read_only image2d_t srcReal,
     854             :              __read_only image2d_t srcImag,
     855             :              __global float *fUnifiedSrcDensity,
     856             :              __global int *nUnifiedSrcValid,
     857             :              __constant char *useBandSrcValid,
     858             :              __global int *nBandSrcValid,
     859             :              const int2 iSrc,
     860             :              int bandNum,
     861             :              vecf *fDensity, vecf *fReal, vecf *fImag)
     862             : {
     863             :     int iSrcOffset = 0, iBandValidLen = 0, iSrcOffsetMask = 0;
     864             :     int bHasValid = FALSE;
     865             : 
     866             :     // Clamp the src offset values if needed
     867             :     if(useUnifiedSrcDensity | useUnifiedSrcValid | useUseBandSrcValid){
     868             :         int iSrcX = iSrc.x;
     869             :         int iSrcY = iSrc.y;
     870             : 
     871             :         // Needed because the offset isn't clamped in OpenCL hardware
     872             :         if(iSrcX < 0)
     873             :             iSrcX = 0;
     874             :         else if(iSrcX >= iSrcWidth)
     875             :             iSrcX = iSrcWidth - 1;
     876             : 
     877             :         if(iSrcY < 0)
     878             :             iSrcY = 0;
     879             :         else if(iSrcY >= iSrcHeight)
     880             :             iSrcY = iSrcHeight - 1;
     881             : 
     882             :         iSrcOffset = iSrcY*iSrcWidth + iSrcX;
     883             :         iBandValidLen = 1 + ((iSrcWidth*iSrcHeight)>>5);
     884             :         iSrcOffsetMask = (0x01 << (iSrcOffset & 0x1f));
     885             :     }
     886             : 
     887             :     if (useUnifiedSrcValid &&
     888             :         !((nUnifiedSrcValid[iSrcOffset>>5] & iSrcOffsetMask) ) )
     889             :         return FALSE;
     890             : 
     891             : #ifdef USE_VEC
     892             :     if (!useUseBandSrcValid || !useBandSrcValid[bandNum] ||
     893             :         ((nBandSrcValid[(iSrcOffset>>5)+iBandValidLen*bandNum    ] & iSrcOffsetMask)) )
     894             :         bHasValid = TRUE;
     895             : 
     896             :     if (!useUseBandSrcValid || !useBandSrcValid[bandNum+1] ||
     897             :         ((nBandSrcValid[(iSrcOffset>>5)+iBandValidLen*(1+bandNum)] & iSrcOffsetMask)) )
     898             :         bHasValid = TRUE;
     899             : 
     900             :     if (!useUseBandSrcValid || !useBandSrcValid[bandNum+2] ||
     901             :         ((nBandSrcValid[(iSrcOffset>>5)+iBandValidLen*(2+bandNum)] & iSrcOffsetMask)) )
     902             :         bHasValid = TRUE;
     903             : 
     904             :     if (!useUseBandSrcValid || !useBandSrcValid[bandNum+3] ||
     905             :         ((nBandSrcValid[(iSrcOffset>>5)+iBandValidLen*(3+bandNum)] & iSrcOffsetMask)) )
     906             :         bHasValid = TRUE;
     907             : #else
     908             :     if (!useUseBandSrcValid || !useBandSrcValid[bandNum] ||
     909             :         ((nBandSrcValid[(iSrcOffset>>5)+iBandValidLen*bandNum    ] & iSrcOffsetMask)) )
     910             :         bHasValid = TRUE;
     911             : #endif
     912             : 
     913             :     if (!bHasValid)
     914             :         return FALSE;
     915             : 
     916             :     const sampler_t samp =  CLK_NORMALIZED_COORDS_FALSE |
     917             :                             CLK_ADDRESS_CLAMP_TO_EDGE |
     918             :                             CLK_FILTER_NEAREST;
     919             : 
     920             : #ifdef USE_VEC
     921             :     (*fReal) = read_imagef(srcReal, samp, iSrc);
     922             :     if (useImag)
     923             :         (*fImag) = read_imagef(srcImag, samp, iSrc);
     924             : #else
     925             :     (*fReal) = read_imagef(srcReal, samp, iSrc).x;
     926             :     if (useImag)
     927             :         (*fImag) = read_imagef(srcImag, samp, iSrc).x;
     928             : #endif
     929             : 
     930             :     if (useUnifiedSrcDensity) {
     931             :         (*fDensity) = fUnifiedSrcDensity[iSrcOffset];
     932             :     } else {
     933             :         (*fDensity) = 1.0f;
     934             :         return TRUE;
     935             :     }
     936             : 
     937             : #ifdef USE_VEC
     938             :     return  (*fDensity).x > 0.0000001f || (*fDensity).y > 0.0000001f ||
     939             :             (*fDensity).z > 0.0000001f || (*fDensity).w > 0.0000001f;
     940             : #else
     941             :     return (*fDensity) > 0.0000001f;
     942             : #endif
     943             : }
     944             : 
     945             : int isValid(__global float *fUnifiedSrcDensity,
     946             :             __global int *nUnifiedSrcValid,
     947             :             float2 fSrcCoords )
     948             : {
     949             :     if (fSrcCoords.x < 0.0f || fSrcCoords.y < 0.0f)
     950             :         return FALSE;
     951             : 
     952             :     int iSrcX = (int) (fSrcCoords.x - 0.5f);
     953             :     int iSrcY = (int) (fSrcCoords.y - 0.5f);
     954             : 
     955             :     if( iSrcX < 0 || iSrcX >= iSrcWidth || iSrcY < 0 || iSrcY >= iSrcHeight )
     956             :         return FALSE;
     957             : 
     958             :     int iSrcOffset = iSrcX + iSrcY * iSrcWidth;
     959             : 
     960             :     if (useUnifiedSrcDensity && fUnifiedSrcDensity[iSrcOffset] < 0.00001f)
     961             :         return FALSE;
     962             : 
     963             :     if (useUnifiedSrcValid &&
     964             :         !(nUnifiedSrcValid[iSrcOffset>>5] & (0x01 << (iSrcOffset & 0x1f))) )
     965             :         return FALSE;
     966             : 
     967             :     return TRUE;
     968             : }
     969             : 
     970             : float2 getSrcCoords(__read_only image2d_t srcCoords)
     971             : {
     972             :     // Find an appropriate place to sample the coordinates so we're still
     973             :     // accurate after linear interpolation.
     974             :     int nDstX = get_global_id(0);
     975             :     int nDstY = get_global_id(1);
     976             :     float2  fDst = (float2)((0.5f * (float)iCoordMult + nDstX) /
     977             :                                 (float)((ceil((iDstWidth  - 1) / (float)iCoordMult) + 1) * iCoordMult),
     978             :                             (0.5f * (float)iCoordMult + nDstY) /
     979             :                                 (float)((ceil((iDstHeight - 1) / (float)iCoordMult) + 1) * iCoordMult));
     980             : 
     981             :     // Check & return when the thread group overruns the image size
     982             :     if (nDstX >= iDstWidth || nDstY >= iDstHeight)
     983             :         return (float2)(-99.0f, -99.0f);
     984             : 
     985             :     const sampler_t samp =  CLK_NORMALIZED_COORDS_TRUE |
     986             :                             CLK_ADDRESS_CLAMP_TO_EDGE |
     987             :                             CLK_FILTER_LINEAR;
     988             : 
     989             :     float4  fSrcCoords = read_imagef(srcCoords,samp,fDst);
     990             : 
     991             :     return (float2)(fSrcCoords.x, fSrcCoords.y);
     992             : }
     993             : )"""";
     994             : 
     995           0 :     const char *kernBilinear = R""""(
     996             : // ************************ Bilinear ************************
     997             : __kernel void resamp(__read_only image2d_t srcCoords,
     998             :                     __read_only image2d_t srcReal,
     999             :                     __read_only image2d_t srcImag,
    1000             :                     __global float *fUnifiedSrcDensity,
    1001             :                     __global int *nUnifiedSrcValid,
    1002             :                     __constant char *useBandSrcValid,
    1003             :                     __global int *nBandSrcValid,
    1004             :                     __global outType *dstReal,
    1005             :                     __global outType *dstImag,
    1006             :                     __constant float *fDstNoDataReal,
    1007             :                     __global float *dstDensity,
    1008             :                     __global int *nDstValid,
    1009             :                     const int bandNum)
    1010             : {
    1011             :     float2  fSrc = getSrcCoords(srcCoords);
    1012             :     if (!isValid(fUnifiedSrcDensity, nUnifiedSrcValid, fSrc))
    1013             :         return;
    1014             : 
    1015             :     int     iSrcX = (int) floor(fSrc.x - 0.5f);
    1016             :     int     iSrcY = (int) floor(fSrc.y - 0.5f);
    1017             :     float   fRatioX = 1.5f - (fSrc.x - iSrcX);
    1018             :     float   fRatioY = 1.5f - (fSrc.y - iSrcY);
    1019             :     vecf    fReal, fImag, fDens;
    1020             :     vecf    fAccumulatorReal = 0.0f, fAccumulatorImag = 0.0f;
    1021             :     vecf    fAccumulatorDensity = 0.0f;
    1022             :     float   fAccumulatorDivisor = 0.0f;
    1023             : 
    1024             :     if ( iSrcY >= 0 && iSrcY < iSrcHeight ) {
    1025             :         float fMult1 = fRatioX * fRatioY;
    1026             :         float fMult2 = (1.0f-fRatioX) * fRatioY;
    1027             : 
    1028             :                 // Upper Left Pixel
    1029             :                 if ( iSrcX >= 0 && iSrcX < iSrcWidth
    1030             :                          && getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
    1031             :                                                 useBandSrcValid, nBandSrcValid, (int2)(iSrcX, iSrcY),
    1032             :                                                 bandNum, &fDens, &fReal, &fImag))
    1033             :                 {
    1034             :                         fAccumulatorDivisor += fMult1;
    1035             :                         fAccumulatorReal += fReal * fMult1;
    1036             :                         fAccumulatorImag += fImag * fMult1;
    1037             :                         fAccumulatorDensity += fDens * fMult1;
    1038             :                 }
    1039             : 
    1040             :                 // Upper Right Pixel
    1041             :                 if ( iSrcX+1 >= 0 && iSrcX+1 < iSrcWidth
    1042             :                         && getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
    1043             :                                                 useBandSrcValid, nBandSrcValid, (int2)(iSrcX+1, iSrcY),
    1044             :                                                 bandNum, &fDens, &fReal, &fImag))
    1045             :                 {
    1046             :                         fAccumulatorDivisor += fMult2;
    1047             :                         fAccumulatorReal += fReal * fMult2;
    1048             :                         fAccumulatorImag += fImag * fMult2;
    1049             :                         fAccumulatorDensity += fDens * fMult2;
    1050             :                 }
    1051             :     }
    1052             : 
    1053             :     if ( iSrcY+1 >= 0 && iSrcY+1 < iSrcHeight ) {
    1054             :         float fMult1 = fRatioX * (1.0f-fRatioY);
    1055             :         float fMult2 = (1.0f-fRatioX) * (1.0f-fRatioY);
    1056             : 
    1057             :         // Lower Left Pixel
    1058             :                 if ( iSrcX >= 0 && iSrcX < iSrcWidth
    1059             :                         && getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
    1060             :                                                 useBandSrcValid, nBandSrcValid, (int2)(iSrcX, iSrcY+1),
    1061             :                                                 bandNum, &fDens, &fReal, &fImag))
    1062             :                 {
    1063             :                         fAccumulatorDivisor += fMult1;
    1064             :                         fAccumulatorReal += fReal * fMult1;
    1065             :                         fAccumulatorImag += fImag * fMult1;
    1066             :                         fAccumulatorDensity += fDens * fMult1;
    1067             :                 }
    1068             : 
    1069             :                 // Lower Right Pixel
    1070             :                 if ( iSrcX+1 >= 0 && iSrcX+1 < iSrcWidth
    1071             :                         && getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
    1072             :                                                 useBandSrcValid, nBandSrcValid, (int2)(iSrcX+1, iSrcY+1),
    1073             :                                                 bandNum, &fDens, &fReal, &fImag))
    1074             :                 {
    1075             :                         fAccumulatorDivisor += fMult2;
    1076             :                         fAccumulatorReal += fReal * fMult2;
    1077             :                         fAccumulatorImag += fImag * fMult2;
    1078             :                         fAccumulatorDensity += fDens * fMult2;
    1079             :                 }
    1080             :     }
    1081             : 
    1082             :     // Compute and save final pixel
    1083             :     if ( fAccumulatorDivisor < 0.00001f ) {
    1084             :         setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
    1085             :                 0.0f, 0.0f, 0.0f );
    1086             :     } else if ( fAccumulatorDivisor < 0.99999f || fAccumulatorDivisor > 1.00001f ) {
    1087             :         setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
    1088             :                 fAccumulatorDensity / fAccumulatorDivisor,
    1089             :                 fAccumulatorReal / fAccumulatorDivisor,
    1090             : #if useImag != 0
    1091             :                 fAccumulatorImag / fAccumulatorDivisor );
    1092             : #else
    1093             :                 0.0f );
    1094             : #endif
    1095             :     } else {
    1096             :         setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
    1097             :                 fAccumulatorDensity, fAccumulatorReal, fAccumulatorImag );
    1098             :     }
    1099             : }
    1100             : )"""";
    1101             : 
    1102           0 :     const char *kernCubic = R""""(
    1103             : // ************************ Cubic ************************
    1104             : vecf cubicConvolution(float dist1, float dist2, float dist3,
    1105             :                         vecf f0, vecf f1, vecf f2, vecf f3);
    1106             : 
    1107             : vecf cubicConvolution(float dist1, float dist2, float dist3,
    1108             :                        vecf f0, vecf f1, vecf f2, vecf f3)
    1109             : {
    1110             :    return (  f1
    1111             :        + 0.5f * (dist1*(f2 - f0)
    1112             :                + dist2*(2.0f*f0 - 5.0f*f1 + 4.0f*f2 - f3)
    1113             :                + dist3*(3.0f*(f1 - f2) + f3 - f0)));
    1114             : }
    1115             : 
    1116             : // ************************ Cubic ************************
    1117             : __kernel void resamp(__read_only image2d_t srcCoords,
    1118             :                      __read_only image2d_t srcReal,
    1119             :                      __read_only image2d_t srcImag,
    1120             :                      __global float *fUnifiedSrcDensity,
    1121             :                      __global int *nUnifiedSrcValid,
    1122             :                      __constant char *useBandSrcValid,
    1123             :                      __global int *nBandSrcValid,
    1124             :                      __global outType *dstReal,
    1125             :                      __global outType *dstImag,
    1126             :                      __constant float *fDstNoDataReal,
    1127             :                      __global float *dstDensity,
    1128             :                      __global int *nDstValid,
    1129             :                      const int bandNum)
    1130             : {
    1131             :     int i;
    1132             :     float2  fSrc = getSrcCoords(srcCoords);
    1133             : 
    1134             :     if (!isValid(fUnifiedSrcDensity, nUnifiedSrcValid, fSrc))
    1135             :         return;
    1136             : 
    1137             :     int     iSrcX = (int) floor( fSrc.x - 0.5f );
    1138             :     int     iSrcY = (int) floor( fSrc.y - 0.5f );
    1139             :     float   fDeltaX = fSrc.x - 0.5f - (float)iSrcX;
    1140             :     float   fDeltaY = fSrc.y - 0.5f - (float)iSrcY;
    1141             :     float   fDeltaX2 = fDeltaX * fDeltaX;
    1142             :     float   fDeltaY2 = fDeltaY * fDeltaY;
    1143             :     float   fDeltaX3 = fDeltaX2 * fDeltaX;
    1144             :     float   fDeltaY3 = fDeltaY2 * fDeltaY;
    1145             :     vecf    afReal[4], afDens[4];
    1146             : #if useImag != 0
    1147             :     vecf    afImag[4];
    1148             : #else
    1149             :     vecf    fImag = 0.0f;
    1150             : #endif
    1151             : 
    1152             :     // Loop over rows
    1153             :     for (i = -1; i < 3; ++i)
    1154             :     {
    1155             :         vecf    fReal1 = 0.0f, fReal2 = 0.0f, fReal3 = 0.0f, fReal4 = 0.0f;
    1156             :         vecf    fDens1 = 0.0f, fDens2 = 0.0f, fDens3 = 0.0f, fDens4 = 0.0f;
    1157             :         int hasPx;
    1158             : #if useImag != 0
    1159             :         vecf    fImag1 = 0.0f, fImag2 = 0.0f, fImag3 = 0.0f, fImag4 = 0.0f;
    1160             : 
    1161             :         //Get all the pixels for this row
    1162             :         hasPx  = getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
    1163             :                         useBandSrcValid, nBandSrcValid, (int2)(iSrcX-1, iSrcY+i),
    1164             :                         bandNum, &fDens1, &fReal1, &fImag1);
    1165             : 
    1166             :         hasPx |= getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
    1167             :                         useBandSrcValid, nBandSrcValid, (int2)(iSrcX  , iSrcY+i),
    1168             :                         bandNum, &fDens2, &fReal2, &fImag2);
    1169             : 
    1170             :         hasPx |= getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
    1171             :                         useBandSrcValid, nBandSrcValid, (int2)(iSrcX+1, iSrcY+i),
    1172             :                         bandNum, &fDens3, &fReal3, &fImag3);
    1173             : 
    1174             :         hasPx |= getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
    1175             :                         useBandSrcValid, nBandSrcValid, (int2)(iSrcX+2, iSrcY+i),
    1176             :                         bandNum, &fDens4, &fReal4, &fImag4);
    1177             : #else
    1178             :         //Get all the pixels for this row
    1179             :         hasPx  = getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
    1180             :                 useBandSrcValid, nBandSrcValid, (int2)(iSrcX-1, iSrcY+i),
    1181             :                 bandNum, &fDens1, &fReal1, &fImag);
    1182             : 
    1183             :         hasPx |= getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
    1184             :                 useBandSrcValid, nBandSrcValid, (int2)(iSrcX  , iSrcY+i),
    1185             :                 bandNum, &fDens2, &fReal2, &fImag);
    1186             : 
    1187             :         hasPx |= getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
    1188             :                 useBandSrcValid, nBandSrcValid, (int2)(iSrcX+1, iSrcY+i),
    1189             :                 bandNum, &fDens3, &fReal3, &fImag);
    1190             : 
    1191             :         hasPx |= getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
    1192             :                 useBandSrcValid, nBandSrcValid, (int2)(iSrcX+2, iSrcY+i),
    1193             :                 bandNum, &fDens4, &fReal4, &fImag);
    1194             : #endif
    1195             : 
    1196             :         // Shortcut if no px
    1197             :         if (!hasPx) {
    1198             :             afDens[i+1] = 0.0f;
    1199             :             afReal[i+1] = 0.0f;
    1200             : #if useImag != 0
    1201             :             afImag[i+1] = 0.0f;
    1202             : #endif
    1203             :             continue;
    1204             :         }
    1205             : 
    1206             :         // Process this row
    1207             :         afDens[i+1] = cubicConvolution(fDeltaX, fDeltaX2, fDeltaX3, fDens1, fDens2, fDens3, fDens4);
    1208             :         afReal[i+1] = cubicConvolution(fDeltaX, fDeltaX2, fDeltaX3, fReal1, fReal2, fReal3, fReal4);
    1209             : #if useImag != 0
    1210             :         afImag[i+1] = cubicConvolution(fDeltaX, fDeltaX2, fDeltaX3, fImag1, fImag2, fImag3, fImag4);
    1211             : #endif
    1212             :     }
    1213             : 
    1214             :     // Compute and save final pixel
    1215             :     setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
    1216             :              cubicConvolution(fDeltaY, fDeltaY2, fDeltaY3, afDens[0], afDens[1], afDens[2], afDens[3]),
    1217             :              cubicConvolution(fDeltaY, fDeltaY2, fDeltaY3, afReal[0], afReal[1], afReal[2], afReal[3]),
    1218             : #if useImag != 0
    1219             :              cubicConvolution(fDeltaY, fDeltaY2, fDeltaY3, afImag[0], afImag[1], afImag[2], afImag[3]) );
    1220             : #else
    1221             :              fImag );
    1222             : #endif
    1223             : }
    1224             : )"""";
    1225             : 
    1226           0 :     const char *kernResampler = R""""(
    1227             : // ************************ LanczosSinc ************************
    1228             : 
    1229             : float lanczosSinc( float fX, float fR );
    1230             : float bSpline( float x );
    1231             : 
    1232             : float lanczosSinc( float fX, float fR )
    1233             : {
    1234             :     if ( fX > fR || fX < -fR)
    1235             :         return 0.0f;
    1236             :     if ( fX == 0.0f )
    1237             :         return 1.0f;
    1238             : 
    1239             :     float fPIX = PI * fX;
    1240             :     return ( sin(fPIX) / fPIX ) * ( sin(fPIX / fR) * fR / fPIX );
    1241             : }
    1242             : 
    1243             : // ************************ Bicubic Spline ************************
    1244             : 
    1245             : float bSpline( float x )
    1246             : {
    1247             :     float xp2 = x + 2.0f;
    1248             :     float xp1 = x + 1.0f;
    1249             :     float xm1 = x - 1.0f;
    1250             :     float xp2c = xp2 * xp2 * xp2;
    1251             : 
    1252             :     return (((xp2 > 0.0f)?((xp1 > 0.0f)?((x > 0.0f)?((xm1 > 0.0f)?
    1253             :                                                      -4.0f * xm1*xm1*xm1:0.0f) +
    1254             :                                          6.0f * x*x*x:0.0f) +
    1255             :                            -4.0f * xp1*xp1*xp1:0.0f) +
    1256             :              xp2c:0.0f) ) * 0.166666666666666666666f;
    1257             : }
    1258             : 
    1259             : // ************************ General Resampler ************************
    1260             : 
    1261             : __kernel void resamp(__read_only image2d_t srcCoords,
    1262             :                      __read_only image2d_t srcReal,
    1263             :                      __read_only image2d_t srcImag,
    1264             :                      __global float *fUnifiedSrcDensity,
    1265             :                      __global int *nUnifiedSrcValid,
    1266             :                      __constant char *useBandSrcValid,
    1267             :                      __global int *nBandSrcValid,
    1268             :                      __global outType *dstReal,
    1269             :                      __global outType *dstImag,
    1270             :                      __constant float *fDstNoDataReal,
    1271             :                      __global float *dstDensity,
    1272             :                      __global int *nDstValid,
    1273             :                      const int bandNum)
    1274             : {
    1275             :     float2  fSrc = getSrcCoords(srcCoords);
    1276             : 
    1277             :     if (!isValid(fUnifiedSrcDensity, nUnifiedSrcValid, fSrc))
    1278             :         return;
    1279             : 
    1280             :     int     iSrcX = floor( fSrc.x - 0.5f );
    1281             :     int     iSrcY = floor( fSrc.y - 0.5f );
    1282             :     float   fDeltaX = fSrc.x - 0.5f - (float)iSrcX;
    1283             :     float   fDeltaY = fSrc.y - 0.5f - (float)iSrcY;
    1284             : 
    1285             :     vecf  fAccumulatorReal = 0.0f, fAccumulatorImag = 0.0f;
    1286             :     vecf  fAccumulatorDensity = 0.0f;
    1287             :     float fAccumulatorWeight = 0.0f;
    1288             :     int   i, j;
    1289             : 
    1290             :     // Loop over pixel rows in the kernel
    1291             :     for ( j = nFiltInitY; j <= nYRadius; ++j )
    1292             :     {
    1293             :         float   fWeight1;
    1294             :         int2 iSrc = (int2)(0, iSrcY + j);
    1295             : 
    1296             :         // Skip sampling over edge of image
    1297             :         if ( iSrc.y < 0 || iSrc.y >= iSrcHeight )
    1298             :             continue;
    1299             : 
    1300             :         // Select the resampling algorithm
    1301             :         if ( doCubicSpline )
    1302             :             // Calculate the Y weight
    1303             :             fWeight1 = ( fYScale < 1.0f ) ?
    1304             :                 bSpline(((float)j) * fYScale) * fYScale :
    1305             :                 bSpline(((float)j) - fDeltaY);
    1306             :         else
    1307             :             fWeight1 = ( fYScale < 1.0f ) ?
    1308             :                 lanczosSinc(j * fYScale, fYFilter) * fYScale :
    1309             :                 lanczosSinc(j - fDeltaY, fYFilter);
    1310             : 
    1311             :         // Iterate over pixels in row
    1312             :         for ( i = nFiltInitX; i <= nXRadius; ++i )
    1313             :         {
    1314             :             float fWeight2;
    1315             :             vecf fDensity = 0.0f, fReal, fImag;
    1316             :             iSrc.x = iSrcX + i;
    1317             : 
    1318             :             // Skip sampling at edge of image
    1319             :             // Skip sampling when invalid pixel
    1320             :             if ( iSrc.x < 0 || iSrc.x >= iSrcWidth ||
    1321             :                   !getPixel(srcReal, srcImag, fUnifiedSrcDensity,
    1322             :                             nUnifiedSrcValid, useBandSrcValid, nBandSrcValid,
    1323             :                             iSrc, bandNum, &fDensity, &fReal, &fImag) )
    1324             :                 continue;
    1325             : 
    1326             :             // Choose among possible algorithms
    1327             :             if ( doCubicSpline )
    1328             :                 // Calculate & save the X weight
    1329             :                 fWeight2 = fWeight1 * ((fXScale < 1.0f ) ?
    1330             :                     bSpline((float)i * fXScale) * fXScale :
    1331             :                     bSpline(fDeltaX - (float)i));
    1332             :             else
    1333             :                 // Calculate & save the X weight
    1334             :                 fWeight2 = fWeight1 * ((fXScale < 1.0f ) ?
    1335             :                     lanczosSinc(i * fXScale, fXFilter) * fXScale :
    1336             :                     lanczosSinc(i - fDeltaX, fXFilter));
    1337             : 
    1338             :             // Accumulate!
    1339             :             fAccumulatorReal += fReal * fWeight2;
    1340             :             fAccumulatorImag += fImag * fWeight2;
    1341             :             fAccumulatorDensity += fDensity * fWeight2;
    1342             :             fAccumulatorWeight += fWeight2;
    1343             :         }
    1344             :     }
    1345             : 
    1346             :     if ( fAccumulatorWeight < 0.000001f ) {
    1347             :         setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
    1348             :                  0.0f, 0.0f, 0.0f);
    1349             :     } else if ( fAccumulatorWeight < 0.99999f || fAccumulatorWeight > 1.00001f ) {
    1350             :         // Calculate the output taking into account weighting
    1351             :         setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
    1352             :                  fAccumulatorDensity / fAccumulatorWeight,
    1353             :                  fAccumulatorReal / fAccumulatorWeight,
    1354             : #if useImag != 0
    1355             :                  fAccumulatorImag / fAccumulatorWeight );
    1356             : #else
    1357             :                  0.0f );
    1358             : #endif
    1359             :     } else {
    1360             :         setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
    1361             :                  fAccumulatorDensity, fAccumulatorReal, fAccumulatorImag);
    1362             :     }
    1363             : }
    1364             : )"""";
    1365             : 
    1366             :     // Defines based on image format
    1367           0 :     switch (warper->imageFormat)
    1368             :     {
    1369           0 :         case CL_FLOAT:
    1370           0 :             dstMinVal = std::numeric_limits<float>::lowest();
    1371           0 :             dstMaxVal = std::numeric_limits<float>::max();
    1372           0 :             outType = "float";
    1373           0 :             break;
    1374           0 :         case CL_SNORM_INT8:
    1375           0 :             dstMinVal = -128.0;
    1376           0 :             dstMaxVal = 127.0;
    1377           0 :             outType = "char";
    1378           0 :             break;
    1379           0 :         case CL_UNORM_INT8:
    1380           0 :             dstMinVal = 0.0;
    1381           0 :             dstMaxVal = 255.0;
    1382           0 :             outType = "uchar";
    1383           0 :             break;
    1384           0 :         case CL_SNORM_INT16:
    1385           0 :             dstMinVal = -32768.0;
    1386           0 :             dstMaxVal = 32767.0;
    1387           0 :             outType = "short";
    1388           0 :             break;
    1389           0 :         case CL_UNORM_INT16:
    1390           0 :             dstMinVal = 0.0;
    1391           0 :             dstMaxVal = 65535.0;
    1392           0 :             outType = "ushort";
    1393           0 :             break;
    1394           0 :         default:
    1395           0 :             CPLError(CE_Failure, CPLE_AppDefined, "Unhandled imageFormat = %d",
    1396             :                      warper->imageFormat);
    1397           0 :             return nullptr;
    1398             :     }
    1399             : 
    1400             :     // Use vector format?
    1401           0 :     if (useVec)
    1402             :     {
    1403           0 :         dUseVec = "-D USE_VEC";
    1404           0 :         dVecf = "float4";
    1405             :     }
    1406             : 
    1407             :     // Assemble the kernel from parts. The compiler is unable to handle multiple
    1408             :     // kernels in one string with more than a few __constant modifiers each.
    1409           0 :     if (warper->resampAlg == OCL_Bilinear)
    1410           0 :         snprintf(&progBuf[0], PROGBUF_SIZE, "%s\n%s", kernGenFuncs,
    1411             :                  kernBilinear);
    1412           0 :     else if (warper->resampAlg == OCL_Cubic)
    1413           0 :         snprintf(&progBuf[0], PROGBUF_SIZE, "%s\n%s", kernGenFuncs, kernCubic);
    1414             :     else
    1415           0 :         snprintf(&progBuf[0], PROGBUF_SIZE, "%s\n%s", kernGenFuncs,
    1416             :                  kernResampler);
    1417             : 
    1418             :     // Actually make the program from assembled source
    1419           0 :     const char *pszProgBuf = progBuf.c_str();
    1420           0 :     program = clCreateProgramWithSource(warper->context, 1, &pszProgBuf,
    1421             :                                         nullptr, &err);
    1422           0 :     handleErrGoto(err, error_final);
    1423             : 
    1424             :     // Assemble the compiler arg string for speed. All invariants should be
    1425             :     // defined here.
    1426           0 :     snprintf(
    1427           0 :         &buffer[0], PROGBUF_SIZE,
    1428             :         "-cl-fast-relaxed-math -Werror -D FALSE=0 -D TRUE=1 "
    1429             :         "%s"
    1430             :         "-D iSrcWidth=%d -D iSrcHeight=%d -D iDstWidth=%d -D iDstHeight=%d "
    1431             :         "-D useUnifiedSrcDensity=%d -D useUnifiedSrcValid=%d "
    1432             :         "-D useDstDensity=%d -D useDstValid=%d -D useImag=%d "
    1433             :         "-D fXScale=%015.15lff -D fYScale=%015.15lff -D fXFilter=%015.15lff -D "
    1434             :         "fYFilter=%015.15lff "
    1435             :         "-D nXRadius=%d -D nYRadius=%d -D nFiltInitX=%d -D nFiltInitY=%d "
    1436             :         "-D PI=%015.15lff -D outType=%s -D dstMinVal=%015.15lff -D "
    1437             :         "dstMaxVal=%015.15lff "
    1438             :         "-D useDstNoDataReal=%d -D vecf=%s %s -D doCubicSpline=%d "
    1439             :         "-D useUseBandSrcValid=%d -D iCoordMult=%d ",
    1440           0 :         (warper->imageFormat == CL_FLOAT) ? "-D USE_CLAMP_TO_DST_FLOAT=1 " : "",
    1441             :         warper->srcWidth, warper->srcHeight, warper->dstWidth,
    1442             :         warper->dstHeight, warper->useUnifiedSrcDensity,
    1443             :         warper->useUnifiedSrcValid, warper->useDstDensity, warper->useDstValid,
    1444           0 :         warper->imagWorkCL != nullptr, dfXScale, dfYScale, dfXFilter, dfYFilter,
    1445             :         nXRadius, nYRadius, nFiltInitX, nFiltInitY, M_PI, outType, dstMinVal,
    1446           0 :         dstMaxVal, warper->fDstNoDataRealCL != nullptr, dVecf, dUseVec,
    1447           0 :         warper->resampAlg == OCL_CubicSpline,
    1448           0 :         warper->nBandSrcValidCL != nullptr, warper->coordMult);
    1449             : 
    1450           0 :     (*clErr) = err = clBuildProgram(program, 1, &(warper->dev), buffer.data(),
    1451             :                                     nullptr, nullptr);
    1452             : 
    1453             :     // Detailed debugging info
    1454           0 :     if (err != CL_SUCCESS)
    1455             :     {
    1456           0 :         const char *pszStatus = "unknown_status";
    1457           0 :         err = clGetProgramBuildInfo(program, warper->dev, CL_PROGRAM_BUILD_LOG,
    1458           0 :                                     PROGBUF_SIZE, &buffer[0], nullptr);
    1459           0 :         handleErrGoto(err, error_free_program);
    1460             : 
    1461           0 :         CPLError(CE_Failure, CPLE_AppDefined,
    1462             :                  "Error: Failed to build program executable!\nBuild Log:\n%s",
    1463             :                  buffer.c_str());
    1464             : 
    1465           0 :         err =
    1466           0 :             clGetProgramBuildInfo(program, warper->dev, CL_PROGRAM_BUILD_STATUS,
    1467           0 :                                   PROGBUF_SIZE, &buffer[0], nullptr);
    1468           0 :         handleErrGoto(err, error_free_program);
    1469             : 
    1470           0 :         if (buffer[0] == CL_BUILD_NONE)
    1471           0 :             pszStatus = "CL_BUILD_NONE";
    1472           0 :         else if (buffer[0] == CL_BUILD_ERROR)
    1473           0 :             pszStatus = "CL_BUILD_ERROR";
    1474           0 :         else if (buffer[0] == CL_BUILD_SUCCESS)
    1475           0 :             pszStatus = "CL_BUILD_SUCCESS";
    1476           0 :         else if (buffer[0] == CL_BUILD_IN_PROGRESS)
    1477           0 :             pszStatus = "CL_BUILD_IN_PROGRESS";
    1478             : 
    1479           0 :         CPLDebug("OpenCL", "Build Status: %s\nProgram Source:\n%s", pszStatus,
    1480             :                  progBuf.c_str());
    1481           0 :         goto error_free_program;
    1482             :     }
    1483             : 
    1484           0 :     kernel = clCreateKernel(program, "resamp", &err);
    1485           0 :     handleErrGoto(err, error_free_program);
    1486             : 
    1487           0 :     err = clReleaseProgram(program);
    1488           0 :     handleErrGoto(err, error_final);
    1489             : 
    1490           0 :     return kernel;
    1491             : 
    1492           0 : error_free_program:
    1493           0 :     err = clReleaseProgram(program);
    1494             : 
    1495           0 : error_final:
    1496           0 :     return nullptr;
    1497             : }
    1498             : 
    1499             : /*
    1500             :  Alloc & copy the coordinate data from host working memory to the device. The
    1501             :  working memory should be a pinned, linear, array of floats. This allows us to
    1502             :  allocate and copy all data in one step. The pointer to the device memory is
    1503             :  saved and set as the appropriate argument number.
    1504             : 
    1505             :  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
    1506             :  */
    1507           0 : static cl_int set_coord_data(struct oclWarper *warper, cl_mem *xy)
    1508             : {
    1509           0 :     cl_int err = CL_SUCCESS;
    1510             :     cl_image_format imgFmt;
    1511             : 
    1512             :     // Copy coord data to the device
    1513           0 :     imgFmt.image_channel_order = warper->xyChOrder;
    1514           0 :     imgFmt.image_channel_data_type = CL_FLOAT;
    1515             : 
    1516             : #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || defined(__clang__)
    1517             : #pragma GCC diagnostic push
    1518             : #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
    1519             : #endif
    1520           0 :     (*xy) = clCreateImage2D(warper->context,
    1521             :                             CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &imgFmt,
    1522           0 :                             static_cast<size_t>(warper->xyWidth),
    1523           0 :                             static_cast<size_t>(warper->xyHeight),
    1524           0 :                             sizeof(float) * warper->xyChSize * warper->xyWidth,
    1525           0 :                             warper->xyWork, &err);
    1526             : #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || defined(__clang__)
    1527             : #pragma GCC diagnostic pop
    1528             : #endif
    1529           0 :     handleErr(err);
    1530             : 
    1531             :     // Free the source memory, now that it's copied we don't need it
    1532           0 :     freeCLMem(warper->xyWorkCL, warper->xyWork);
    1533             : 
    1534             :     // Set up argument
    1535           0 :     if (warper->kern1 != nullptr)
    1536             :     {
    1537           0 :         handleErr(err = clSetKernelArg(warper->kern1, 0, sizeof(cl_mem), xy));
    1538             :     }
    1539           0 :     if (warper->kern4 != nullptr)
    1540             :     {
    1541           0 :         handleErr(err = clSetKernelArg(warper->kern4, 0, sizeof(cl_mem), xy));
    1542             :     }
    1543             : 
    1544           0 :     return CL_SUCCESS;
    1545             : }
    1546             : 
    1547             : /*
    1548             :  Sets the unified density & valid data structures. These are optional structures
    1549             :  from GDAL, and as such if they are NULL a small placeholder memory segment is
    1550             :  defined. This is because the spec is unclear on if a NULL value can be passed
    1551             :  as a kernel argument in place of memory. If it's not NULL, the data is copied
    1552             :  from the working memory to the device memory. After that, we check if we are
    1553             :  using the per-band validity mask, and set that as appropriate. At the end, the
    1554             :  CL mem is passed as the kernel arguments.
    1555             : 
    1556             :  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
    1557             :  */
    1558             : static cl_int
    1559           0 : set_unified_data(struct oclWarper *warper, cl_mem *unifiedSrcDensityCL,
    1560             :                  cl_mem *unifiedSrcValidCL, float *unifiedSrcDensity,
    1561             :                  unsigned int *unifiedSrcValid, cl_mem *useBandSrcValidCL,
    1562             :                  cl_mem *nBandSrcValidCL)
    1563             : {
    1564           0 :     cl_int err = CL_SUCCESS;
    1565           0 :     size_t sz = warper->srcWidth * warper->srcHeight;
    1566           0 :     int useValid = warper->nBandSrcValidCL != nullptr;
    1567             :     // 32 bits in the mask
    1568           0 :     int validSz = static_cast<int>(sizeof(int) * ((31 + sz) >> 5));
    1569             : 
    1570             :     // Copy unifiedSrcDensity if it exists
    1571           0 :     if (unifiedSrcDensity == nullptr)
    1572             :     {
    1573             :         // Alloc dummy device RAM
    1574           0 :         (*unifiedSrcDensityCL) =
    1575           0 :             clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
    1576           0 :         handleErr(err);
    1577             :     }
    1578             :     else
    1579             :     {
    1580             :         // Alloc & copy all density data
    1581           0 :         (*unifiedSrcDensityCL) = clCreateBuffer(
    1582             :             warper->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
    1583             :             sizeof(float) * sz, unifiedSrcDensity, &err);
    1584           0 :         handleErr(err);
    1585             :     }
    1586             : 
    1587             :     // Copy unifiedSrcValid if it exists
    1588           0 :     if (unifiedSrcValid == nullptr)
    1589             :     {
    1590             :         // Alloc dummy device RAM
    1591           0 :         (*unifiedSrcValidCL) =
    1592           0 :             clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
    1593           0 :         handleErr(err);
    1594             :     }
    1595             :     else
    1596             :     {
    1597             :         // Alloc & copy all validity data
    1598           0 :         (*unifiedSrcValidCL) = clCreateBuffer(
    1599             :             warper->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, validSz,
    1600             :             unifiedSrcValid, &err);
    1601           0 :         handleErr(err);
    1602             :     }
    1603             : 
    1604             :     // Set the band validity usage
    1605           0 :     if (useValid)
    1606             :     {
    1607           0 :         (*useBandSrcValidCL) = clCreateBuffer(
    1608             :             warper->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
    1609           0 :             sizeof(char) * warper->numBands, warper->useBandSrcValid, &err);
    1610           0 :         handleErr(err);
    1611             :     }
    1612             :     else
    1613             :     {
    1614             :         // Make a fake image so we don't have a NULL pointer
    1615           0 :         (*useBandSrcValidCL) =
    1616           0 :             clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
    1617           0 :         handleErr(err);
    1618             :     }
    1619             : 
    1620             :     // Do a more thorough check for validity
    1621           0 :     if (useValid)
    1622             :     {
    1623             :         int i;
    1624           0 :         useValid = FALSE;
    1625           0 :         for (i = 0; i < warper->numBands; ++i)
    1626           0 :             if (warper->useBandSrcValid[i])
    1627           0 :                 useValid = TRUE;
    1628             :     }
    1629             : 
    1630             :     // And the validity mask if needed
    1631           0 :     if (useValid)
    1632             :     {
    1633           0 :         (*nBandSrcValidCL) = clCreateBuffer(
    1634             :             warper->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
    1635           0 :             warper->numBands * validSz, warper->nBandSrcValid, &err);
    1636           0 :         handleErr(err);
    1637             :     }
    1638             :     else
    1639             :     {
    1640             :         // Make a fake image so we don't have a NULL pointer
    1641           0 :         (*nBandSrcValidCL) =
    1642           0 :             clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
    1643           0 :         handleErr(err);
    1644             :     }
    1645             : 
    1646             :     // Set up arguments
    1647           0 :     if (warper->kern1 != nullptr)
    1648             :     {
    1649           0 :         handleErr(err = clSetKernelArg(warper->kern1, 3, sizeof(cl_mem),
    1650             :                                        unifiedSrcDensityCL));
    1651           0 :         handleErr(err = clSetKernelArg(warper->kern1, 4, sizeof(cl_mem),
    1652             :                                        unifiedSrcValidCL));
    1653           0 :         handleErr(err = clSetKernelArg(warper->kern1, 5, sizeof(cl_mem),
    1654             :                                        useBandSrcValidCL));
    1655           0 :         handleErr(err = clSetKernelArg(warper->kern1, 6, sizeof(cl_mem),
    1656             :                                        nBandSrcValidCL));
    1657             :     }
    1658           0 :     if (warper->kern4 != nullptr)
    1659             :     {
    1660           0 :         handleErr(err = clSetKernelArg(warper->kern4, 3, sizeof(cl_mem),
    1661             :                                        unifiedSrcDensityCL));
    1662           0 :         handleErr(err = clSetKernelArg(warper->kern4, 4, sizeof(cl_mem),
    1663             :                                        unifiedSrcValidCL));
    1664           0 :         handleErr(err = clSetKernelArg(warper->kern4, 5, sizeof(cl_mem),
    1665             :                                        useBandSrcValidCL));
    1666           0 :         handleErr(err = clSetKernelArg(warper->kern4, 6, sizeof(cl_mem),
    1667             :                                        nBandSrcValidCL));
    1668             :     }
    1669             : 
    1670           0 :     return CL_SUCCESS;
    1671             : }
    1672             : 
    1673             : /*
    1674             :  Here we set the per-band raster data. First priority is the real raster data,
    1675             :  of course. Then, if applicable, we set the additional image channel. Once this
    1676             :  data is copied to the device, it can be freed on the host, so that is done
    1677             :  here. Finally the appropriate kernel arguments are set.
    1678             : 
    1679             :  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
    1680             :  */
    1681           0 : static cl_int set_src_rast_data(struct oclWarper *warper, int iNum, size_t sz,
    1682             :                                 cl_channel_order chOrder, cl_mem *srcReal,
    1683             :                                 cl_mem *srcImag)
    1684             : {
    1685             :     cl_image_format imgFmt;
    1686           0 :     cl_int err = CL_SUCCESS;
    1687           0 :     int useImagWork =
    1688           0 :         warper->imagWork.v != nullptr && warper->imagWork.v[iNum] != nullptr;
    1689             : 
    1690             :     // Set up image vars
    1691           0 :     imgFmt.image_channel_order = chOrder;
    1692           0 :     imgFmt.image_channel_data_type = warper->imageFormat;
    1693             : 
    1694             :     // Create & copy the source image
    1695             : #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || defined(__clang__)
    1696             : #pragma GCC diagnostic push
    1697             : #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
    1698             : #endif
    1699             : 
    1700           0 :     (*srcReal) = clCreateImage2D(
    1701             :         warper->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &imgFmt,
    1702           0 :         static_cast<size_t>(warper->srcWidth),
    1703           0 :         static_cast<size_t>(warper->srcHeight), sz * warper->srcWidth,
    1704           0 :         warper->realWork.v[iNum], &err);
    1705           0 :     handleErr(err);
    1706             : 
    1707             :     // And the source image parts if needed
    1708           0 :     if (useImagWork)
    1709             :     {
    1710           0 :         (*srcImag) = clCreateImage2D(
    1711             :             warper->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &imgFmt,
    1712           0 :             static_cast<size_t>(warper->srcWidth),
    1713           0 :             static_cast<size_t>(warper->srcHeight), sz * warper->srcWidth,
    1714           0 :             warper->imagWork.v[iNum], &err);
    1715           0 :         handleErr(err);
    1716             :     }
    1717             :     else
    1718             :     {
    1719             :         // Make a fake image so we don't have a NULL pointer
    1720             : 
    1721             :         char dummyImageData[16];
    1722           0 :         (*srcImag) = clCreateImage2D(warper->context,
    1723             :                                      CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
    1724             :                                      &imgFmt, 1, 1, sz, dummyImageData, &err);
    1725             : 
    1726           0 :         handleErr(err);
    1727             :     }
    1728             : #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || defined(__clang__)
    1729             : #pragma GCC diagnostic pop
    1730             : #endif
    1731             : 
    1732             :     // Free the source memory, now that it's copied we don't need it
    1733           0 :     freeCLMem(warper->realWorkCL[iNum], warper->realWork.v[iNum]);
    1734           0 :     if (warper->imagWork.v != nullptr)
    1735             :     {
    1736           0 :         freeCLMem(warper->imagWorkCL[iNum], warper->imagWork.v[iNum]);
    1737             :     }
    1738             : 
    1739             :     // Set up per-band arguments
    1740           0 :     if (warper->kern1 != nullptr)
    1741             :     {
    1742           0 :         handleErr(
    1743             :             err = clSetKernelArg(warper->kern1, 1, sizeof(cl_mem), srcReal));
    1744           0 :         handleErr(
    1745             :             err = clSetKernelArg(warper->kern1, 2, sizeof(cl_mem), srcImag));
    1746             :     }
    1747           0 :     if (warper->kern4 != nullptr)
    1748             :     {
    1749           0 :         handleErr(
    1750             :             err = clSetKernelArg(warper->kern4, 1, sizeof(cl_mem), srcReal));
    1751           0 :         handleErr(
    1752             :             err = clSetKernelArg(warper->kern4, 2, sizeof(cl_mem), srcImag));
    1753             :     }
    1754             : 
    1755           0 :     return CL_SUCCESS;
    1756             : }
    1757             : 
    1758             : /*
    1759             :  Set the destination data for the raster. Although it's the output, it still
    1760             :  is copied to the device because some blending is done there. First the real
    1761             :  data is allocated and copied, then the imag data is allocated and copied if
    1762             :  needed. They are then set as the appropriate arguments to the kernel.
    1763             : 
    1764             :  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
    1765             :  */
    1766           0 : static cl_int set_dst_rast_data(struct oclWarper *warper, int iImg, size_t sz,
    1767             :                                 cl_mem *dstReal, cl_mem *dstImag)
    1768             : {
    1769           0 :     cl_int err = CL_SUCCESS;
    1770           0 :     sz *= warper->dstWidth * warper->dstHeight;
    1771             : 
    1772             :     // Copy the dst real data
    1773           0 :     (*dstReal) = clCreateBuffer(warper->context,
    1774             :                                 CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sz,
    1775           0 :                                 warper->dstRealWork.v[iImg], &err);
    1776           0 :     handleErr(err);
    1777             : 
    1778             :     // Copy the dst imag data if exists
    1779           0 :     if (warper->dstImagWork.v != nullptr &&
    1780           0 :         warper->dstImagWork.v[iImg] != nullptr)
    1781             :     {
    1782           0 :         (*dstImag) = clCreateBuffer(warper->context,
    1783             :                                     CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
    1784           0 :                                     sz, warper->dstImagWork.v[iImg], &err);
    1785           0 :         handleErr(err);
    1786             :     }
    1787             :     else
    1788             :     {
    1789           0 :         (*dstImag) = clCreateBuffer(warper->context, CL_MEM_READ_WRITE, 1,
    1790             :                                     nullptr, &err);
    1791           0 :         handleErr(err);
    1792             :     }
    1793             : 
    1794             :     // Set up per-band arguments
    1795           0 :     if (warper->kern1 != nullptr)
    1796             :     {
    1797           0 :         handleErr(
    1798             :             err = clSetKernelArg(warper->kern1, 7, sizeof(cl_mem), dstReal));
    1799           0 :         handleErr(
    1800             :             err = clSetKernelArg(warper->kern1, 8, sizeof(cl_mem), dstImag));
    1801             :     }
    1802           0 :     if (warper->kern4 != nullptr)
    1803             :     {
    1804           0 :         handleErr(
    1805             :             err = clSetKernelArg(warper->kern4, 7, sizeof(cl_mem), dstReal));
    1806           0 :         handleErr(
    1807             :             err = clSetKernelArg(warper->kern4, 8, sizeof(cl_mem), dstImag));
    1808             :     }
    1809             : 
    1810           0 :     return CL_SUCCESS;
    1811             : }
    1812             : 
    1813             : /*
    1814             :  Read the final raster data back from the graphics card to working memory. This
    1815             :  copies both the real memory and the imag memory if appropriate.
    1816             : 
    1817             :  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
    1818             :  */
    1819           0 : static cl_int get_dst_rast_data(struct oclWarper *warper, int iImg,
    1820             :                                 size_t wordSz, cl_mem dstReal, cl_mem dstImag)
    1821             : {
    1822           0 :     cl_int err = CL_SUCCESS;
    1823           0 :     size_t sz = warper->dstWidth * warper->dstHeight * wordSz;
    1824             : 
    1825             :     // Copy from dev into working memory
    1826           0 :     handleErr(err = clEnqueueReadBuffer(warper->queue, dstReal, CL_FALSE, 0, sz,
    1827             :                                         warper->dstRealWork.v[iImg], 0, nullptr,
    1828             :                                         nullptr));
    1829             : 
    1830             :     // If we are expecting the imag channel, then copy it back also
    1831           0 :     if (warper->dstImagWork.v != nullptr &&
    1832           0 :         warper->dstImagWork.v[iImg] != nullptr)
    1833             :     {
    1834           0 :         handleErr(err = clEnqueueReadBuffer(warper->queue, dstImag, CL_FALSE, 0,
    1835             :                                             sz, warper->dstImagWork.v[iImg], 0,
    1836             :                                             nullptr, nullptr));
    1837             :     }
    1838             : 
    1839             :     // The copy requests were non-blocking, so we'll need to make sure they
    1840             :     // finish.
    1841           0 :     handleErr(err = clFinish(warper->queue));
    1842             : 
    1843           0 :     return CL_SUCCESS;
    1844             : }
    1845             : 
    1846             : /*
    1847             :  Set the destination image density & validity mask on the device. This is used
    1848             :  to blend the final output image with the existing buffer. This handles the
    1849             :  unified structures that apply to all bands. After the buffers are created and
    1850             :  copied, they are set as kernel arguments.
    1851             : 
    1852             :  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
    1853             :  */
    1854           0 : static cl_int set_dst_data(struct oclWarper *warper, cl_mem *dstDensityCL,
    1855             :                            cl_mem *dstValidCL, cl_mem *dstNoDataRealCL,
    1856             :                            float *dstDensity, unsigned int *dstValid,
    1857             :                            float *dstNoDataReal)
    1858             : {
    1859           0 :     cl_int err = CL_SUCCESS;
    1860           0 :     size_t sz = warper->dstWidth * warper->dstHeight;
    1861             : 
    1862             :     // Copy the no-data value(s)
    1863           0 :     if (dstNoDataReal == nullptr)
    1864             :     {
    1865           0 :         (*dstNoDataRealCL) =
    1866           0 :             clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
    1867           0 :         handleErr(err);
    1868             :     }
    1869             :     else
    1870             :     {
    1871           0 :         (*dstNoDataRealCL) = clCreateBuffer(
    1872             :             warper->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
    1873           0 :             sizeof(float) * warper->numBands, dstNoDataReal, &err);
    1874           0 :         handleErr(err);
    1875             :     }
    1876             : 
    1877             :     // Copy unifiedSrcDensity if it exists
    1878           0 :     if (dstDensity == nullptr)
    1879             :     {
    1880           0 :         (*dstDensityCL) =
    1881           0 :             clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
    1882           0 :         handleErr(err);
    1883             :     }
    1884             :     else
    1885             :     {
    1886           0 :         (*dstDensityCL) = clCreateBuffer(
    1887             :             warper->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
    1888             :             sizeof(float) * sz, dstDensity, &err);
    1889           0 :         handleErr(err);
    1890             :     }
    1891             : 
    1892             :     // Copy unifiedSrcValid if it exists
    1893           0 :     if (dstValid == nullptr)
    1894             :     {
    1895           0 :         (*dstValidCL) =
    1896           0 :             clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
    1897           0 :         handleErr(err);
    1898             :     }
    1899             :     else
    1900             :     {
    1901           0 :         (*dstValidCL) = clCreateBuffer(
    1902             :             warper->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
    1903           0 :             sizeof(int) * ((31 + sz) >> 5), dstValid, &err);
    1904           0 :         handleErr(err);
    1905             :     }
    1906             : 
    1907             :     // Set up arguments
    1908           0 :     if (warper->kern1 != nullptr)
    1909             :     {
    1910           0 :         handleErr(err = clSetKernelArg(warper->kern1, 9, sizeof(cl_mem),
    1911             :                                        dstNoDataRealCL));
    1912           0 :         handleErr(err = clSetKernelArg(warper->kern1, 10, sizeof(cl_mem),
    1913             :                                        dstDensityCL));
    1914           0 :         handleErr(err = clSetKernelArg(warper->kern1, 11, sizeof(cl_mem),
    1915             :                                        dstValidCL));
    1916             :     }
    1917           0 :     if (warper->kern4 != nullptr)
    1918             :     {
    1919           0 :         handleErr(err = clSetKernelArg(warper->kern4, 9, sizeof(cl_mem),
    1920             :                                        dstNoDataRealCL));
    1921           0 :         handleErr(err = clSetKernelArg(warper->kern4, 10, sizeof(cl_mem),
    1922             :                                        dstDensityCL));
    1923           0 :         handleErr(err = clSetKernelArg(warper->kern4, 11, sizeof(cl_mem),
    1924             :                                        dstValidCL));
    1925             :     }
    1926             : 
    1927           0 :     return CL_SUCCESS;
    1928             : }
    1929             : 
    1930             : /*
    1931             :  Go ahead and execute the kernel. This handles some housekeeping stuff like the
    1932             :  run dimensions. When running in debug mode, it times the kernel call and prints
    1933             :  the execution time.
    1934             : 
    1935             :  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
    1936             :  */
    1937           0 : static cl_int execute_kern(struct oclWarper *warper, cl_kernel kern,
    1938             :                            size_t loc_size)
    1939             : {
    1940           0 :     cl_int err = CL_SUCCESS;
    1941             :     cl_event ev;
    1942             :     size_t ceil_runs[2];
    1943             :     size_t group_size[2];
    1944             : #ifdef DEBUG_OPENCL
    1945             :     size_t start_time = 0;
    1946             :     size_t end_time;
    1947             :     const char *vecTxt = "";
    1948             : #endif
    1949             : 
    1950             :     // Use a likely X-dimension which is a power of 2
    1951           0 :     if (loc_size >= 512)
    1952           0 :         group_size[0] = 32;
    1953           0 :     else if (loc_size >= 64)
    1954           0 :         group_size[0] = 16;
    1955           0 :     else if (loc_size > 8)
    1956           0 :         group_size[0] = 8;
    1957             :     else
    1958           0 :         group_size[0] = 1;
    1959             : 
    1960           0 :     if (group_size[0] > loc_size)
    1961           0 :         group_size[1] = group_size[0] / loc_size;
    1962             :     else
    1963           0 :         group_size[1] = 1;
    1964             : 
    1965             :     // Round up num_runs to find the dim of the block of pixels we'll be
    1966             :     // processing
    1967           0 :     if (warper->dstWidth % group_size[0])
    1968           0 :         ceil_runs[0] =
    1969           0 :             warper->dstWidth + group_size[0] - warper->dstWidth % group_size[0];
    1970             :     else
    1971           0 :         ceil_runs[0] = warper->dstWidth;
    1972             : 
    1973           0 :     if (warper->dstHeight % group_size[1])
    1974           0 :         ceil_runs[1] = warper->dstHeight + group_size[1] -
    1975           0 :                        warper->dstHeight % group_size[1];
    1976             :     else
    1977           0 :         ceil_runs[1] = warper->dstHeight;
    1978             : 
    1979             : #ifdef DEBUG_OPENCL
    1980             :     handleErr(err = clSetCommandQueueProperty(
    1981             :                   warper->queue, CL_QUEUE_PROFILING_ENABLE, CL_TRUE, nullptr));
    1982             : #endif
    1983             : 
    1984             :     // Run the calculation by enqueuing it and forcing the
    1985             :     // command queue to complete the task
    1986           0 :     handleErr(err = clEnqueueNDRangeKernel(warper->queue, kern, 2, nullptr,
    1987             :                                            ceil_runs, group_size, 0, nullptr,
    1988             :                                            &ev));
    1989           0 :     handleErr(err = clFinish(warper->queue));
    1990             : 
    1991             : #ifdef DEBUG_OPENCL
    1992             :     handleErr(err = clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START,
    1993             :                                             sizeof(size_t), &start_time,
    1994             :                                             nullptr));
    1995             :     handleErr(err =
    1996             :                   clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END,
    1997             :                                           sizeof(size_t), &end_time, nullptr));
    1998             :     assert(end_time != 0);
    1999             :     assert(start_time != 0);
    2000             :     if (kern == warper->kern4)
    2001             :         vecTxt = "(vec)";
    2002             : 
    2003             :     CPLDebug("OpenCL", "Kernel Time: %6s %10lu", vecTxt,
    2004             :              static_cast<long int>((end_time - start_time) / 100000));
    2005             : #endif
    2006             : 
    2007           0 :     handleErr(err = clReleaseEvent(ev));
    2008           0 :     return CL_SUCCESS;
    2009             : }
    2010             : 
    2011             : /*
    2012             :  Copy data from a raw source to the warper's working memory. If the imag
    2013             :  channel is expected, then the data will be de-interlaced into component blocks
    2014             :  of memory.
    2015             : 
    2016             :  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
    2017             :  */
    2018           0 : static cl_int set_img_data(struct oclWarper *warper, void *srcImgData,
    2019             :                            unsigned int width, unsigned int height, int isSrc,
    2020             :                            unsigned int bandNum, void **dstRealImgs,
    2021             :                            void **dstImagImgs)
    2022             : {
    2023           0 :     unsigned int imgChSize = warper->imgChSize1;
    2024             :     unsigned int iSrcY, i;
    2025           0 :     unsigned int vecOff = 0;
    2026           0 :     unsigned int imgNum = bandNum;
    2027           0 :     void *dstReal = nullptr;
    2028           0 :     void *dstImag = nullptr;
    2029             : 
    2030             :     // Handle vector if needed
    2031           0 :     if (warper->useVec &&
    2032           0 :         static_cast<int>(bandNum) < warper->numBands - warper->numBands % 4)
    2033             :     {
    2034           0 :         imgChSize = warper->imgChSize4;
    2035           0 :         vecOff = bandNum % 4;
    2036           0 :         imgNum = bandNum / 4;
    2037             :     }
    2038           0 :     else if (warper->useVec)
    2039             :     {
    2040           0 :         imgNum = bandNum / 4 + bandNum % 4;
    2041             :     }
    2042             : 
    2043             :     // Set the images as needed
    2044           0 :     dstReal = dstRealImgs[imgNum];
    2045           0 :     if (dstImagImgs == nullptr)
    2046           0 :         dstImag = nullptr;
    2047             :     else
    2048           0 :         dstImag = dstImagImgs[imgNum];
    2049             : 
    2050             :     // Set stuff for dst imgs
    2051           0 :     if (!isSrc)
    2052             :     {
    2053           0 :         vecOff *= height * width;
    2054           0 :         imgChSize = 1;
    2055             :     }
    2056             : 
    2057             :     // Copy values as needed
    2058           0 :     if (warper->imagWorkCL == nullptr && !(warper->useVec && isSrc))
    2059             :     {
    2060             :         // Set memory size & location depending on the data type
    2061             :         // This is the ideal code path for speed
    2062           0 :         switch (warper->imageFormat)
    2063             :         {
    2064           0 :             case CL_UNORM_INT8:
    2065             :             {
    2066           0 :                 unsigned char *realDst =
    2067           0 :                     &((static_cast<unsigned char *>(dstReal))[vecOff]);
    2068           0 :                 memcpy(realDst, srcImgData,
    2069           0 :                        width * height * sizeof(unsigned char));
    2070           0 :                 break;
    2071             :             }
    2072           0 :             case CL_SNORM_INT8:
    2073             :             {
    2074           0 :                 char *realDst = &((static_cast<char *>(dstReal))[vecOff]);
    2075           0 :                 memcpy(realDst, srcImgData, width * height * sizeof(char));
    2076           0 :                 break;
    2077             :             }
    2078           0 :             case CL_UNORM_INT16:
    2079             :             {
    2080           0 :                 unsigned short *realDst =
    2081           0 :                     &((static_cast<unsigned short *>(dstReal))[vecOff]);
    2082           0 :                 memcpy(realDst, srcImgData,
    2083           0 :                        width * height * sizeof(unsigned short));
    2084           0 :                 break;
    2085             :             }
    2086           0 :             case CL_SNORM_INT16:
    2087             :             {
    2088           0 :                 short *realDst = &((static_cast<short *>(dstReal))[vecOff]);
    2089           0 :                 memcpy(realDst, srcImgData, width * height * sizeof(short));
    2090           0 :                 break;
    2091             :             }
    2092           0 :             case CL_FLOAT:
    2093             :             {
    2094           0 :                 float *realDst = &((static_cast<float *>(dstReal))[vecOff]);
    2095           0 :                 memcpy(realDst, srcImgData, width * height * sizeof(float));
    2096           0 :                 break;
    2097             :             }
    2098           0 :         }
    2099             :     }
    2100           0 :     else if (warper->imagWorkCL == nullptr)
    2101             :     {
    2102             :         // We need to space the values due to OpenCL implementation reasons
    2103           0 :         for (iSrcY = 0; iSrcY < height; iSrcY++)
    2104             :         {
    2105           0 :             int pxOff = width * iSrcY;
    2106           0 :             int imgOff = imgChSize * pxOff + vecOff;
    2107             :             // Copy & deinterleave interleaved data
    2108           0 :             switch (warper->imageFormat)
    2109             :             {
    2110           0 :                 case CL_UNORM_INT8:
    2111             :                 {
    2112           0 :                     unsigned char *realDst =
    2113           0 :                         &((static_cast<unsigned char *>(dstReal))[imgOff]);
    2114           0 :                     unsigned char *dataSrc =
    2115           0 :                         &((static_cast<unsigned char *>(srcImgData))[pxOff]);
    2116           0 :                     for (i = 0; i < width; ++i)
    2117           0 :                         realDst[imgChSize * i] = dataSrc[i];
    2118             :                 }
    2119           0 :                 break;
    2120           0 :                 case CL_SNORM_INT8:
    2121             :                 {
    2122           0 :                     char *realDst = &((static_cast<char *>(dstReal))[imgOff]);
    2123           0 :                     char *dataSrc = &((static_cast<char *>(srcImgData))[pxOff]);
    2124           0 :                     for (i = 0; i < width; ++i)
    2125           0 :                         realDst[imgChSize * i] = dataSrc[i];
    2126             :                 }
    2127           0 :                 break;
    2128           0 :                 case CL_UNORM_INT16:
    2129             :                 {
    2130           0 :                     unsigned short *realDst =
    2131           0 :                         &((static_cast<unsigned short *>(dstReal))[imgOff]);
    2132           0 :                     unsigned short *dataSrc =
    2133           0 :                         &((static_cast<unsigned short *>(srcImgData))[pxOff]);
    2134           0 :                     for (i = 0; i < width; ++i)
    2135           0 :                         realDst[imgChSize * i] = dataSrc[i];
    2136             :                 }
    2137           0 :                 break;
    2138           0 :                 case CL_SNORM_INT16:
    2139             :                 {
    2140           0 :                     short *realDst = &((static_cast<short *>(dstReal))[imgOff]);
    2141           0 :                     short *dataSrc =
    2142           0 :                         &((static_cast<short *>(srcImgData))[pxOff]);
    2143           0 :                     for (i = 0; i < width; ++i)
    2144           0 :                         realDst[imgChSize * i] = dataSrc[i];
    2145             :                 }
    2146           0 :                 break;
    2147           0 :                 case CL_FLOAT:
    2148             :                 {
    2149           0 :                     float *realDst = &((static_cast<float *>(dstReal))[imgOff]);
    2150           0 :                     float *dataSrc =
    2151           0 :                         &((static_cast<float *>(srcImgData))[pxOff]);
    2152           0 :                     for (i = 0; i < width; ++i)
    2153           0 :                         realDst[imgChSize * i] = dataSrc[i];
    2154             :                 }
    2155           0 :                 break;
    2156             :             }
    2157             :         }
    2158             :     }
    2159             :     else
    2160             :     {
    2161           0 :         assert(dstImag);
    2162             : 
    2163             :         // Copy, deinterleave, & space interleaved data
    2164           0 :         for (iSrcY = 0; iSrcY < height; iSrcY++)
    2165             :         {
    2166           0 :             int pxOff = width * iSrcY;
    2167           0 :             int imgOff = imgChSize * pxOff + vecOff;
    2168             :             // Copy & deinterleave interleaved data
    2169           0 :             switch (warper->imageFormat)
    2170             :             {
    2171           0 :                 case CL_FLOAT:
    2172             :                 {
    2173           0 :                     float *realDst = &((static_cast<float *>(dstReal))[imgOff]);
    2174           0 :                     float *imagDst = &((static_cast<float *>(dstImag))[imgOff]);
    2175           0 :                     float *dataSrc =
    2176           0 :                         &((static_cast<float *>(srcImgData))[pxOff]);
    2177           0 :                     for (i = 0; i < width; ++i)
    2178             :                     {
    2179           0 :                         realDst[imgChSize * i] = dataSrc[i * 2];
    2180           0 :                         imagDst[imgChSize * i] = dataSrc[i * 2 + 1];
    2181             :                     }
    2182             :                 }
    2183           0 :                 break;
    2184           0 :                 case CL_SNORM_INT8:
    2185             :                 {
    2186           0 :                     char *realDst = &((static_cast<char *>(dstReal))[imgOff]);
    2187           0 :                     char *imagDst = &((static_cast<char *>(dstImag))[imgOff]);
    2188           0 :                     char *dataSrc = &((static_cast<char *>(srcImgData))[pxOff]);
    2189           0 :                     for (i = 0; i < width; ++i)
    2190             :                     {
    2191           0 :                         realDst[imgChSize * i] = dataSrc[i * 2];
    2192           0 :                         imagDst[imgChSize * i] = dataSrc[i * 2 + 1];
    2193             :                     }
    2194             :                 }
    2195           0 :                 break;
    2196           0 :                 case CL_UNORM_INT8:
    2197             :                 {
    2198           0 :                     unsigned char *realDst =
    2199           0 :                         &((static_cast<unsigned char *>(dstReal))[imgOff]);
    2200           0 :                     unsigned char *imagDst =
    2201           0 :                         &((static_cast<unsigned char *>(dstImag))[imgOff]);
    2202           0 :                     unsigned char *dataSrc =
    2203           0 :                         &((static_cast<unsigned char *>(srcImgData))[pxOff]);
    2204           0 :                     for (i = 0; i < width; ++i)
    2205             :                     {
    2206           0 :                         realDst[imgChSize * i] = dataSrc[i * 2];
    2207           0 :                         imagDst[imgChSize * i] = dataSrc[i * 2 + 1];
    2208             :                     }
    2209             :                 }
    2210           0 :                 break;
    2211           0 :                 case CL_SNORM_INT16:
    2212             :                 {
    2213           0 :                     short *realDst = &((static_cast<short *>(dstReal))[imgOff]);
    2214           0 :                     short *imagDst = &((static_cast<short *>(dstImag))[imgOff]);
    2215           0 :                     short *dataSrc =
    2216           0 :                         &((static_cast<short *>(srcImgData))[pxOff]);
    2217           0 :                     for (i = 0; i < width; ++i)
    2218             :                     {
    2219           0 :                         realDst[imgChSize * i] = dataSrc[i * 2];
    2220           0 :                         imagDst[imgChSize * i] = dataSrc[i * 2 + 1];
    2221             :                     }
    2222             :                 }
    2223           0 :                 break;
    2224           0 :                 case CL_UNORM_INT16:
    2225             :                 {
    2226           0 :                     unsigned short *realDst =
    2227           0 :                         &((static_cast<unsigned short *>(dstReal))[imgOff]);
    2228           0 :                     unsigned short *imagDst =
    2229           0 :                         &((static_cast<unsigned short *>(dstImag))[imgOff]);
    2230           0 :                     unsigned short *dataSrc =
    2231           0 :                         &((static_cast<unsigned short *>(srcImgData))[pxOff]);
    2232           0 :                     for (i = 0; i < width; ++i)
    2233             :                     {
    2234           0 :                         realDst[imgChSize * i] = dataSrc[i * 2];
    2235           0 :                         imagDst[imgChSize * i] = dataSrc[i * 2 + 1];
    2236             :                     }
    2237             :                 }
    2238           0 :                 break;
    2239             :             }
    2240             :         }
    2241             :     }
    2242             : 
    2243           0 :     return CL_SUCCESS;
    2244             : }
    2245             : 
    2246             : /*
    2247             :  Creates the struct which inits & contains the OpenCL context & environment.
    2248             :  Inits wired(?) space to buffer the image in host RAM. Chooses the OpenCL
    2249             :  device, perhaps the user can choose it later? This would also choose the
    2250             :  appropriate OpenCL image format (R, RG, RGBA, or multiples thereof). Space
    2251             :  for metadata can be allocated as required, though.
    2252             : 
    2253             :  Supported image formats are:
    2254             :  CL_FLOAT, CL_SNORM_INT8, CL_UNORM_INT8, CL_SNORM_INT16, CL_UNORM_INT16
    2255             :  32-bit int formats won't keep precision when converted to floats internally
    2256             :  and doubles are generally not supported on the GPU image formats.
    2257             :  */
    2258           0 : struct oclWarper *GDALWarpKernelOpenCL_createEnv(
    2259             :     int srcWidth, int srcHeight, int dstWidth, int dstHeight,
    2260             :     cl_channel_type imageFormat, int numBands, int coordMult, int useImag,
    2261             :     int useBandSrcValid, CPL_UNUSED float *fDstDensity, double *dfDstNoDataReal,
    2262             :     OCLResampAlg resampAlg, cl_int *clErr)
    2263             : {
    2264             :     struct oclWarper *warper;
    2265             :     int i;
    2266           0 :     size_t maxWidth = 0, maxHeight = 0;
    2267           0 :     cl_int err = CL_SUCCESS;
    2268             :     size_t fmtSize, sz;
    2269             :     cl_device_id device;
    2270             :     cl_bool bool_flag;
    2271           0 :     OCLVendor eCLVendor = VENDOR_OTHER;
    2272             : 
    2273             :     // Do we have a suitable OpenCL device?
    2274           0 :     device = get_device(&eCLVendor);
    2275           0 :     if (device == nullptr)
    2276           0 :         return nullptr;
    2277             : 
    2278           0 :     err = clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool),
    2279             :                           &bool_flag, &sz);
    2280           0 :     if (err != CL_SUCCESS || !bool_flag)
    2281             :     {
    2282           0 :         CPLDebug("OpenCL", "No image support on selected device.");
    2283           0 :         return nullptr;
    2284             :     }
    2285             : 
    2286             :     // Set up warper environment.
    2287             :     warper =
    2288           0 :         static_cast<struct oclWarper *>(CPLCalloc(1, sizeof(struct oclWarper)));
    2289             : 
    2290           0 :     warper->eCLVendor = eCLVendor;
    2291             : 
    2292             :     // Init passed vars
    2293           0 :     warper->srcWidth = srcWidth;
    2294           0 :     warper->srcHeight = srcHeight;
    2295           0 :     warper->dstWidth = dstWidth;
    2296           0 :     warper->dstHeight = dstHeight;
    2297             : 
    2298           0 :     warper->coordMult = coordMult;
    2299           0 :     warper->numBands = numBands;
    2300           0 :     warper->imageFormat = imageFormat;
    2301           0 :     warper->resampAlg = resampAlg;
    2302             : 
    2303           0 :     warper->useUnifiedSrcDensity = FALSE;
    2304           0 :     warper->useUnifiedSrcValid = FALSE;
    2305           0 :     warper->useDstDensity = FALSE;
    2306           0 :     warper->useDstValid = FALSE;
    2307             : 
    2308           0 :     warper->imagWorkCL = nullptr;
    2309           0 :     warper->dstImagWorkCL = nullptr;
    2310           0 :     warper->useBandSrcValidCL = nullptr;
    2311           0 :     warper->useBandSrcValid = nullptr;
    2312           0 :     warper->nBandSrcValidCL = nullptr;
    2313           0 :     warper->nBandSrcValid = nullptr;
    2314           0 :     warper->fDstNoDataRealCL = nullptr;
    2315           0 :     warper->fDstNoDataReal = nullptr;
    2316           0 :     warper->kern1 = nullptr;
    2317           0 :     warper->kern4 = nullptr;
    2318             : 
    2319           0 :     warper->dev = device;
    2320             : 
    2321           0 :     warper->context =
    2322           0 :         clCreateContext(nullptr, 1, &(warper->dev), nullptr, nullptr, &err);
    2323           0 :     handleErrGoto(err, error_label);
    2324             : #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || defined(__clang__)
    2325             : #pragma GCC diagnostic push
    2326             : #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
    2327             : #endif
    2328           0 :     warper->queue = clCreateCommandQueue(warper->context, warper->dev, 0, &err);
    2329             : #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || defined(__clang__)
    2330             : #pragma GCC diagnostic pop
    2331             : #endif
    2332           0 :     handleErrGoto(err, error_label);
    2333             : 
    2334             :     // Ensure that we hand handle imagery of these dimensions
    2335           0 :     err = clGetDeviceInfo(warper->dev, CL_DEVICE_IMAGE2D_MAX_WIDTH,
    2336             :                           sizeof(size_t), &maxWidth, &sz);
    2337           0 :     handleErrGoto(err, error_label);
    2338           0 :     err = clGetDeviceInfo(warper->dev, CL_DEVICE_IMAGE2D_MAX_HEIGHT,
    2339             :                           sizeof(size_t), &maxHeight, &sz);
    2340           0 :     handleErrGoto(err, error_label);
    2341           0 :     if (maxWidth < static_cast<size_t>(srcWidth) ||
    2342           0 :         maxHeight < static_cast<size_t>(srcHeight))
    2343             :     {
    2344           0 :         err = CL_INVALID_IMAGE_SIZE;
    2345           0 :         handleErrGoto(err, error_label);
    2346             :     }
    2347             : 
    2348             :     // Split bands into sets of four when possible
    2349             :     // Cubic runs slower as vector, so don't use it (probably register pressure)
    2350             :     // Feel free to do more testing and come up with more precise case
    2351             :     // statements
    2352           0 :     if (numBands < 4 || resampAlg == OCL_Cubic)
    2353             :     {
    2354           0 :         warper->numImages = numBands;
    2355           0 :         warper->useVec = FALSE;
    2356             :     }
    2357             :     else
    2358             :     {
    2359           0 :         warper->numImages = numBands / 4 + numBands % 4;
    2360           0 :         warper->useVec = TRUE;
    2361             :     }
    2362             : 
    2363             :     // Make the pointer space for the real images
    2364           0 :     warper->realWorkCL =
    2365           0 :         static_cast<cl_mem *>(CPLCalloc(sizeof(cl_mem), warper->numImages));
    2366           0 :     warper->dstRealWorkCL =
    2367           0 :         static_cast<cl_mem *>(CPLCalloc(sizeof(cl_mem), warper->numImages));
    2368             : 
    2369             :     // Make space for the per-channel Imag data (if exists)
    2370           0 :     if (useImag)
    2371             :     {
    2372           0 :         warper->imagWorkCL =
    2373           0 :             static_cast<cl_mem *>(CPLCalloc(sizeof(cl_mem), warper->numImages));
    2374           0 :         warper->dstImagWorkCL =
    2375           0 :             static_cast<cl_mem *>(CPLCalloc(sizeof(cl_mem), warper->numImages));
    2376             :     }
    2377             : 
    2378             :     // Make space for the per-band BandSrcValid data (if exists)
    2379           0 :     if (useBandSrcValid)
    2380             :     {
    2381             :         // 32 bits in the mask
    2382           0 :         sz = warper->numBands *
    2383           0 :              ((31 + warper->srcWidth * warper->srcHeight) >> 5);
    2384             : 
    2385             :         // Allocate some space for the validity of the validity mask
    2386             :         void *useBandSrcValidTab[1];
    2387             :         cl_mem useBandSrcValidCLTab[1];
    2388           0 :         err = alloc_pinned_mem(warper, 0, warper->numBands * sizeof(char),
    2389             :                                useBandSrcValidTab, useBandSrcValidCLTab);
    2390           0 :         warper->useBandSrcValid = static_cast<char *>(useBandSrcValidTab[0]);
    2391           0 :         warper->useBandSrcValidCL = useBandSrcValidCLTab[0];
    2392           0 :         handleErrGoto(err, error_label);
    2393             : 
    2394           0 :         for (i = 0; i < warper->numBands; ++i)
    2395           0 :             warper->useBandSrcValid[i] = FALSE;
    2396             : 
    2397             :         // Allocate one array for all the band validity masks.
    2398             :         // Remember that the masks don't use much memory (they're bitwise).
    2399             :         void *nBandSrcValidTab[1];
    2400             :         cl_mem nBandSrcValidCLTab[1];
    2401           0 :         err = alloc_pinned_mem(warper, 0, sz * sizeof(int), nBandSrcValidTab,
    2402             :                                nBandSrcValidCLTab);
    2403           0 :         warper->nBandSrcValid = static_cast<float *>(nBandSrcValidTab[0]);
    2404           0 :         warper->nBandSrcValidCL = nBandSrcValidCLTab[0];
    2405           0 :         handleErrGoto(err, error_label);
    2406             :     }
    2407             : 
    2408             :     // Make space for the per-band
    2409           0 :     if (dfDstNoDataReal != nullptr)
    2410             :     {
    2411             :         void *fDstNoDataRealTab[1];
    2412             :         cl_mem fDstNoDataRealCLTab[1];
    2413           0 :         alloc_pinned_mem(warper, 0, warper->numBands, fDstNoDataRealTab,
    2414             :                          fDstNoDataRealCLTab);
    2415           0 :         warper->fDstNoDataReal = static_cast<float *>(fDstNoDataRealTab[0]);
    2416           0 :         warper->fDstNoDataRealCL = fDstNoDataRealCLTab[0];
    2417             : 
    2418             :         // Copy over values
    2419           0 :         for (i = 0; i < warper->numBands; ++i)
    2420           0 :             warper->fDstNoDataReal[i] = static_cast<float>(dfDstNoDataReal[i]);
    2421             :     }
    2422             : 
    2423             :     // Alloc working host image memory
    2424             :     // We'll be copying into these buffers soon
    2425           0 :     switch (imageFormat)
    2426             :     {
    2427           0 :         case CL_FLOAT:
    2428           0 :             err = alloc_working_arr(warper, sizeof(float *), sizeof(float),
    2429             :                                     &fmtSize);
    2430           0 :             break;
    2431           0 :         case CL_SNORM_INT8:
    2432           0 :             err = alloc_working_arr(warper, sizeof(char *), sizeof(char),
    2433             :                                     &fmtSize);
    2434           0 :             break;
    2435           0 :         case CL_UNORM_INT8:
    2436           0 :             err = alloc_working_arr(warper, sizeof(unsigned char *),
    2437             :                                     sizeof(unsigned char), &fmtSize);
    2438           0 :             break;
    2439           0 :         case CL_SNORM_INT16:
    2440           0 :             err = alloc_working_arr(warper, sizeof(short *), sizeof(short),
    2441             :                                     &fmtSize);
    2442           0 :             break;
    2443           0 :         case CL_UNORM_INT16:
    2444           0 :             err = alloc_working_arr(warper, sizeof(unsigned short *),
    2445             :                                     sizeof(unsigned short), &fmtSize);
    2446           0 :             break;
    2447             :     }
    2448           0 :     handleErrGoto(err, error_label);
    2449             : 
    2450             :     // Find a good & compatible image channel order for the Lat/Long array.
    2451           0 :     err = set_supported_formats(warper, 2, &(warper->xyChOrder),
    2452             :                                 &(warper->xyChSize), CL_FLOAT);
    2453           0 :     handleErrGoto(err, error_label);
    2454             : 
    2455             :     // Set coordinate image dimensions
    2456           0 :     warper->xyWidth =
    2457           0 :         static_cast<int>(ceil((static_cast<float>(warper->dstWidth) +
    2458           0 :                                static_cast<float>(warper->coordMult) - 1) /
    2459           0 :                               static_cast<float>(warper->coordMult)));
    2460           0 :     warper->xyHeight =
    2461           0 :         static_cast<int>(ceil((static_cast<float>(warper->dstHeight) +
    2462           0 :                                static_cast<float>(warper->coordMult) - 1) /
    2463           0 :                               static_cast<float>(warper->coordMult)));
    2464             : 
    2465             :     // Alloc coord memory
    2466           0 :     sz = sizeof(float) * warper->xyChSize * warper->xyWidth * warper->xyHeight;
    2467             :     void *xyWorkTab[1];
    2468             :     cl_mem xyWorkCLTab[1];
    2469           0 :     err = alloc_pinned_mem(warper, 0, sz, xyWorkTab, xyWorkCLTab);
    2470           0 :     warper->xyWork = static_cast<float *>(xyWorkTab[0]);
    2471           0 :     warper->xyWorkCL = xyWorkCLTab[0];
    2472           0 :     handleErrGoto(err, error_label);
    2473             : 
    2474             :     // Ensure everything is finished allocating, copying, & mapping
    2475           0 :     err = clFinish(warper->queue);
    2476           0 :     handleErrGoto(err, error_label);
    2477             : 
    2478           0 :     (*clErr) = CL_SUCCESS;
    2479           0 :     return warper;
    2480             : 
    2481           0 : error_label:
    2482           0 :     GDALWarpKernelOpenCL_deleteEnv(warper);
    2483           0 :     return nullptr;
    2484             : }
    2485             : 
    2486             : /*
    2487             :  Copy the validity mask for an image band to the warper.
    2488             : 
    2489             :  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
    2490             :  */
    2491           0 : cl_int GDALWarpKernelOpenCL_setSrcValid(struct oclWarper *warper,
    2492             :                                         int *bandSrcValid, int bandNum)
    2493             : {
    2494             :     // 32 bits in the mask
    2495           0 :     int stride = (31 + warper->srcWidth * warper->srcHeight) >> 5;
    2496             : 
    2497             :     // Copy bandSrcValid
    2498           0 :     assert(warper->nBandSrcValid != nullptr);
    2499           0 :     memcpy(&(warper->nBandSrcValid[bandNum * stride]), bandSrcValid,
    2500           0 :            sizeof(int) * stride);
    2501           0 :     warper->useBandSrcValid[bandNum] = TRUE;
    2502             : 
    2503           0 :     return CL_SUCCESS;
    2504             : }
    2505             : 
    2506             : /*
    2507             :  Sets the source image real & imag into the host memory so that it is
    2508             :  permuted (ex. RGBA) for better graphics card access.
    2509             : 
    2510             :  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
    2511             :  */
    2512           0 : cl_int GDALWarpKernelOpenCL_setSrcImg(struct oclWarper *warper, void *imgData,
    2513             :                                       int bandNum)
    2514             : {
    2515           0 :     void **imagWorkPtr = nullptr;
    2516             : 
    2517           0 :     if (warper->imagWorkCL != nullptr)
    2518           0 :         imagWorkPtr = warper->imagWork.v;
    2519             : 
    2520           0 :     return set_img_data(warper, imgData, warper->srcWidth, warper->srcHeight,
    2521           0 :                         TRUE, bandNum, warper->realWork.v, imagWorkPtr);
    2522             : }
    2523             : 
    2524             : /*
    2525             :  Sets the destination image real & imag into the host memory so that it is
    2526             :  permuted (ex. RGBA) for better graphics card access.
    2527             : 
    2528             :  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
    2529             :  */
    2530           0 : cl_int GDALWarpKernelOpenCL_setDstImg(struct oclWarper *warper, void *imgData,
    2531             :                                       int bandNum)
    2532             : {
    2533           0 :     void **dstImagWorkPtr = nullptr;
    2534             : 
    2535           0 :     if (warper->dstImagWorkCL != nullptr)
    2536           0 :         dstImagWorkPtr = warper->dstImagWork.v;
    2537             : 
    2538           0 :     return set_img_data(warper, imgData, warper->dstWidth, warper->dstHeight,
    2539           0 :                         FALSE, bandNum, warper->dstRealWork.v, dstImagWorkPtr);
    2540             : }
    2541             : 
    2542             : /*
    2543             :  Inputs the source coordinates for a row of the destination pixels. Invalid
    2544             :  coordinates are set as -99.0, which should be out of the image bounds. Sets
    2545             :  the coordinates as ready to be used in OpenCL image memory: interleaved and
    2546             :  minus the offset. By using image memory, we can use a smaller texture for
    2547             :  coordinates and use OpenCL's built-in interpolation to save memory.
    2548             : 
    2549             :  What it does: generates a smaller matrix of X/Y coordinate transformation
    2550             :  values from an original matrix. When bilinearly sampled in the GPU hardware,
    2551             :  the generated values are as close as possible to the original matrix.
    2552             : 
    2553             :  Complication: matrices have arbitrary dimensions and the sub-sampling factor
    2554             :  is an arbitrary integer greater than zero. Getting the edge cases right is
    2555             :  difficult.
    2556             : 
    2557             :  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
    2558             :  */
    2559           0 : cl_int GDALWarpKernelOpenCL_setCoordRow(struct oclWarper *warper,
    2560             :                                         double *rowSrcX, double *rowSrcY,
    2561             :                                         double srcXOff, double srcYOff,
    2562             :                                         int *success, int rowNum)
    2563             : {
    2564           0 :     int coordMult = warper->coordMult;
    2565           0 :     int width = warper->dstWidth;
    2566           0 :     int height = warper->dstHeight;
    2567           0 :     int xyWidth = warper->xyWidth;
    2568             :     int i;
    2569           0 :     int xyChSize = warper->xyChSize;
    2570           0 :     float *xyPtr, *xyPrevPtr = nullptr;
    2571           0 :     int lastRow = rowNum == height - 1;
    2572           0 :     double dstHeightMod = 1.0, dstWidthMod = 1.0;
    2573             : 
    2574             :     // Return if we're at an off row
    2575           0 :     if (!lastRow && rowNum % coordMult != 0)
    2576           0 :         return CL_SUCCESS;
    2577             : 
    2578             :     // Standard row, adjusted for the skipped rows
    2579           0 :     xyPtr = &(warper->xyWork[xyWidth * xyChSize * rowNum / coordMult]);
    2580             : 
    2581             :     // Find our row
    2582           0 :     if (lastRow)
    2583             :     {
    2584             :         // Setup for the final row
    2585           0 :         xyPtr = &(warper->xyWork[xyWidth * xyChSize * (warper->xyHeight - 1)]);
    2586           0 :         xyPrevPtr =
    2587           0 :             &(warper->xyWork[xyWidth * xyChSize * (warper->xyHeight - 2)]);
    2588             : 
    2589           0 :         if ((height - 1) % coordMult)
    2590           0 :             dstHeightMod = static_cast<double>(coordMult) /
    2591           0 :                            static_cast<double>((height - 1) % coordMult);
    2592             :     }
    2593             : 
    2594             :     // Copy selected coordinates
    2595           0 :     for (i = 0; i < width; i += coordMult)
    2596             :     {
    2597           0 :         if (success[i])
    2598             :         {
    2599           0 :             xyPtr[0] = static_cast<float>(rowSrcX[i] - srcXOff);
    2600           0 :             xyPtr[1] = static_cast<float>(rowSrcY[i] - srcYOff);
    2601             : 
    2602           0 :             if (lastRow)
    2603             :             {
    2604             :                 // Adjust bottom row so interpolator returns correct value
    2605           0 :                 xyPtr[0] = static_cast<float>(
    2606           0 :                     dstHeightMod * (xyPtr[0] - xyPrevPtr[0]) + xyPrevPtr[0]);
    2607           0 :                 xyPtr[1] = static_cast<float>(
    2608           0 :                     dstHeightMod * (xyPtr[1] - xyPrevPtr[1]) + xyPrevPtr[1]);
    2609             :             }
    2610             :         }
    2611             :         else
    2612             :         {
    2613           0 :             xyPtr[0] = -99.0f;
    2614           0 :             xyPtr[1] = -99.0f;
    2615             :         }
    2616             : 
    2617           0 :         xyPtr += xyChSize;
    2618           0 :         xyPrevPtr += xyChSize;
    2619             :     }
    2620             : 
    2621             :     // Copy remaining coordinate
    2622           0 :     if ((width - 1) % coordMult)
    2623             :     {
    2624           0 :         dstWidthMod = static_cast<double>(coordMult) /
    2625           0 :                       static_cast<double>((width - 1) % coordMult);
    2626           0 :         xyPtr -= xyChSize;
    2627           0 :         xyPrevPtr -= xyChSize;
    2628             :     }
    2629             :     else
    2630             :     {
    2631           0 :         xyPtr -= xyChSize * 2;
    2632           0 :         xyPrevPtr -= xyChSize * 2;
    2633             :     }
    2634             : 
    2635           0 :     if (lastRow)
    2636             :     {
    2637           0 :         double origX = rowSrcX[width - 1] - srcXOff;
    2638           0 :         double origY = rowSrcY[width - 1] - srcYOff;
    2639           0 :         double a = 1.0, b = 1.0;
    2640             : 
    2641             :         // Calculate the needed x/y values using an equation from the OpenCL
    2642             :         // Spec section 8.2, solving for Ti1j1
    2643           0 :         if ((width - 1) % coordMult)
    2644           0 :             a = ((width - 1) % coordMult) / static_cast<double>(coordMult);
    2645             : 
    2646           0 :         if ((height - 1) % coordMult)
    2647           0 :             b = ((height - 1) % coordMult) / static_cast<double>(coordMult);
    2648             : 
    2649           0 :         xyPtr[xyChSize] = static_cast<float>(
    2650           0 :             (((1.0 - a) * (1.0 - b) * xyPrevPtr[0] +
    2651           0 :               a * (1.0 - b) * xyPrevPtr[xyChSize] + (1.0 - a) * b * xyPtr[0]) -
    2652           0 :              origX) /
    2653           0 :             (-a * b));
    2654             : 
    2655           0 :         xyPtr[xyChSize + 1] =
    2656           0 :             static_cast<float>((((1.0 - a) * (1.0 - b) * xyPrevPtr[1] +
    2657           0 :                                  a * (1.0 - b) * xyPrevPtr[xyChSize + 1] +
    2658           0 :                                  (1.0 - a) * b * xyPtr[1]) -
    2659           0 :                                 origY) /
    2660           0 :                                (-a * b));
    2661             :     }
    2662             :     else
    2663             :     {
    2664             :         // Adjust last coordinate so interpolator returns correct value
    2665           0 :         xyPtr[xyChSize] = static_cast<float>(
    2666           0 :             dstWidthMod * (rowSrcX[width - 1] - srcXOff - xyPtr[0]) + xyPtr[0]);
    2667           0 :         xyPtr[xyChSize + 1] = static_cast<float>(
    2668           0 :             dstWidthMod * (rowSrcY[width - 1] - srcYOff - xyPtr[1]) + xyPtr[1]);
    2669             :     }
    2670             : 
    2671           0 :     return CL_SUCCESS;
    2672             : }
    2673             : 
    2674             : /*
    2675             :  Copies all data to the device RAM, frees the host RAM, runs the
    2676             :  appropriate resampling kernel, mallocs output space, & copies the data
    2677             :  back from the device RAM for each band. Also check to make sure that
    2678             :  setRow*() was called the appropriate number of times to init all image
    2679             :  data.
    2680             : 
    2681             :  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
    2682             :  */
    2683           0 : cl_int GDALWarpKernelOpenCL_runResamp(
    2684             :     struct oclWarper *warper, float *unifiedSrcDensity,
    2685             :     unsigned int *unifiedSrcValid, float *dstDensity, unsigned int *dstValid,
    2686             :     double dfXScale, double dfYScale, double dfXFilter, double dfYFilter,
    2687             :     int nXRadius, int nYRadius, int nFiltInitX, int nFiltInitY)
    2688             : {
    2689           0 :     int i, nextBandNum = 0, chSize = 1;
    2690           0 :     cl_int err = CL_SUCCESS;
    2691             :     cl_mem xy, unifiedSrcDensityCL, unifiedSrcValidCL;
    2692             :     cl_mem dstDensityCL, dstValidCL, dstNoDataRealCL;
    2693             :     cl_mem useBandSrcValidCL, nBandSrcValidCL;
    2694           0 :     size_t groupSize, wordSize = 0;
    2695           0 :     cl_kernel kern = nullptr;
    2696             :     cl_channel_order chOrder;
    2697             : 
    2698           0 :     warper->useUnifiedSrcDensity = unifiedSrcDensity != nullptr;
    2699           0 :     warper->useUnifiedSrcValid = unifiedSrcValid != nullptr;
    2700             : 
    2701             :     // Check the word size
    2702           0 :     switch (warper->imageFormat)
    2703             :     {
    2704           0 :         case CL_FLOAT:
    2705           0 :             wordSize = sizeof(float);
    2706           0 :             break;
    2707           0 :         case CL_SNORM_INT8:
    2708           0 :             wordSize = sizeof(char);
    2709           0 :             break;
    2710           0 :         case CL_UNORM_INT8:
    2711           0 :             wordSize = sizeof(unsigned char);
    2712           0 :             break;
    2713           0 :         case CL_SNORM_INT16:
    2714           0 :             wordSize = sizeof(short);
    2715           0 :             break;
    2716           0 :         case CL_UNORM_INT16:
    2717           0 :             wordSize = sizeof(unsigned short);
    2718           0 :             break;
    2719             :     }
    2720             : 
    2721             :     // Compile the kernel; the invariants are being compiled into the code
    2722           0 :     if (!warper->useVec || warper->numBands % 4)
    2723             :     {
    2724           0 :         warper->kern1 =
    2725           0 :             get_kernel(warper, FALSE, dfXScale, dfYScale, dfXFilter, dfYFilter,
    2726             :                        nXRadius, nYRadius, nFiltInitX, nFiltInitY, &err);
    2727           0 :         handleErr(err);
    2728             :     }
    2729           0 :     if (warper->useVec)
    2730             :     {
    2731           0 :         warper->kern4 =
    2732           0 :             get_kernel(warper, TRUE, dfXScale, dfYScale, dfXFilter, dfYFilter,
    2733             :                        nXRadius, nYRadius, nFiltInitX, nFiltInitY, &err);
    2734           0 :         handleErr(err);
    2735             :     }
    2736             : 
    2737             :     // Copy coord data to the device
    2738           0 :     handleErr(err = set_coord_data(warper, &xy));
    2739             : 
    2740             :     // Copy unified density & valid data
    2741           0 :     handleErr(err = set_unified_data(warper, &unifiedSrcDensityCL,
    2742             :                                      &unifiedSrcValidCL, unifiedSrcDensity,
    2743             :                                      unifiedSrcValid, &useBandSrcValidCL,
    2744             :                                      &nBandSrcValidCL));
    2745             : 
    2746             :     // Copy output density & valid data
    2747           0 :     handleErr(set_dst_data(warper, &dstDensityCL, &dstValidCL, &dstNoDataRealCL,
    2748             :                            dstDensity, dstValid, warper->fDstNoDataReal));
    2749             : 
    2750             :     // What's the recommended group size?
    2751           0 :     if (warper->useVec)
    2752             :     {
    2753             :         // Start with the vector kernel
    2754           0 :         handleErr(clGetKernelWorkGroupInfo(
    2755             :             warper->kern4, warper->dev, CL_KERNEL_WORK_GROUP_SIZE,
    2756             :             sizeof(size_t), &groupSize, nullptr));
    2757           0 :         kern = warper->kern4;
    2758           0 :         chSize = warper->imgChSize4;
    2759           0 :         chOrder = warper->imgChOrder4;
    2760             :     }
    2761             :     else
    2762             :     {
    2763             :         // We're only using the float kernel
    2764           0 :         handleErr(clGetKernelWorkGroupInfo(
    2765             :             warper->kern1, warper->dev, CL_KERNEL_WORK_GROUP_SIZE,
    2766             :             sizeof(size_t), &groupSize, nullptr));
    2767           0 :         kern = warper->kern1;
    2768           0 :         chSize = warper->imgChSize1;
    2769           0 :         chOrder = warper->imgChOrder1;
    2770             :     }
    2771             : 
    2772             :     // Loop over each image
    2773           0 :     for (i = 0; i < warper->numImages; ++i)
    2774             :     {
    2775             :         cl_mem srcImag, srcReal;
    2776             :         cl_mem dstReal, dstImag;
    2777           0 :         int bandNum = nextBandNum;
    2778             : 
    2779             :         // Switch kernels if needed
    2780           0 :         if (warper->useVec &&
    2781           0 :             nextBandNum < warper->numBands - warper->numBands % 4)
    2782             :         {
    2783           0 :             nextBandNum += 4;
    2784             :         }
    2785             :         else
    2786             :         {
    2787           0 :             if (kern == warper->kern4)
    2788             :             {
    2789           0 :                 handleErr(clGetKernelWorkGroupInfo(
    2790             :                     warper->kern1, warper->dev, CL_KERNEL_WORK_GROUP_SIZE,
    2791             :                     sizeof(size_t), &groupSize, nullptr));
    2792           0 :                 kern = warper->kern1;
    2793           0 :                 chSize = warper->imgChSize1;
    2794           0 :                 chOrder = warper->imgChOrder1;
    2795             :             }
    2796           0 :             ++nextBandNum;
    2797             :         }
    2798             : 
    2799             :         // Create & copy the source image
    2800           0 :         handleErr(err = set_src_rast_data(warper, i, chSize * wordSize, chOrder,
    2801             :                                           &srcReal, &srcImag));
    2802             : 
    2803             :         // Create & copy the output image
    2804           0 :         if (kern == warper->kern1)
    2805             :         {
    2806           0 :             handleErr(err = set_dst_rast_data(warper, i, wordSize, &dstReal,
    2807             :                                               &dstImag));
    2808             :         }
    2809             :         else
    2810             :         {
    2811           0 :             handleErr(err = set_dst_rast_data(warper, i, wordSize * 4, &dstReal,
    2812             :                                               &dstImag));
    2813             :         }
    2814             : 
    2815             :         // Set the bandNum
    2816           0 :         handleErr(err = clSetKernelArg(kern, 12, sizeof(int), &bandNum));
    2817             : 
    2818             :         // Run the kernel
    2819           0 :         handleErr(err = execute_kern(warper, kern, groupSize));
    2820             : 
    2821             :         // Free loop CL mem
    2822           0 :         handleErr(err = clReleaseMemObject(srcReal));
    2823           0 :         handleErr(err = clReleaseMemObject(srcImag));
    2824             : 
    2825             :         // Copy the back output results
    2826           0 :         if (kern == warper->kern1)
    2827             :         {
    2828           0 :             handleErr(
    2829             :                 err = get_dst_rast_data(warper, i, wordSize, dstReal, dstImag));
    2830             :         }
    2831             :         else
    2832             :         {
    2833           0 :             handleErr(err = get_dst_rast_data(warper, i, wordSize * 4, dstReal,
    2834             :                                               dstImag));
    2835             :         }
    2836             : 
    2837             :         // Free remaining CL mem
    2838           0 :         handleErr(err = clReleaseMemObject(dstReal));
    2839           0 :         handleErr(err = clReleaseMemObject(dstImag));
    2840             :     }
    2841             : 
    2842             :     // Free remaining CL mem
    2843           0 :     handleErr(err = clReleaseMemObject(xy));
    2844           0 :     handleErr(err = clReleaseMemObject(unifiedSrcDensityCL));
    2845           0 :     handleErr(err = clReleaseMemObject(unifiedSrcValidCL));
    2846           0 :     handleErr(err = clReleaseMemObject(useBandSrcValidCL));
    2847           0 :     handleErr(err = clReleaseMemObject(nBandSrcValidCL));
    2848           0 :     handleErr(err = clReleaseMemObject(dstDensityCL));
    2849           0 :     handleErr(err = clReleaseMemObject(dstValidCL));
    2850           0 :     handleErr(err = clReleaseMemObject(dstNoDataRealCL));
    2851             : 
    2852           0 :     return CL_SUCCESS;
    2853             : }
    2854             : 
    2855             : /*
    2856             :  Sets pointers to the floating point data in the warper. The pointers
    2857             :  are internal to the warper structure, so don't free() them. If the imag
    2858             :  channel is in use, it will receive a pointer. Otherwise it'll be set to NULL.
    2859             :  These are pointers to floating point data, so the caller will need to
    2860             :  manipulate the output as appropriate before saving the data.
    2861             : 
    2862             :  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
    2863             :  */
    2864           0 : cl_int GDALWarpKernelOpenCL_getRow(struct oclWarper *warper, void **rowReal,
    2865             :                                    void **rowImag, int rowNum, int bandNum)
    2866             : {
    2867           0 :     int memOff = rowNum * warper->dstWidth;
    2868           0 :     int imgNum = bandNum;
    2869             : 
    2870           0 :     if (warper->useVec && bandNum < warper->numBands - warper->numBands % 4)
    2871             :     {
    2872           0 :         memOff += warper->dstWidth * warper->dstHeight * (bandNum % 4);
    2873           0 :         imgNum = bandNum / 4;
    2874             :     }
    2875           0 :     else if (warper->useVec)
    2876             :     {
    2877           0 :         imgNum = bandNum / 4 + bandNum % 4;
    2878             :     }
    2879             : 
    2880             :     // Return pointers into the warper's data
    2881           0 :     switch (warper->imageFormat)
    2882             :     {
    2883           0 :         case CL_FLOAT:
    2884           0 :             (*rowReal) = &(warper->dstRealWork.f[imgNum][memOff]);
    2885           0 :             break;
    2886           0 :         case CL_SNORM_INT8:
    2887           0 :             (*rowReal) = &(warper->dstRealWork.c[imgNum][memOff]);
    2888           0 :             break;
    2889           0 :         case CL_UNORM_INT8:
    2890           0 :             (*rowReal) = &(warper->dstRealWork.uc[imgNum][memOff]);
    2891           0 :             break;
    2892           0 :         case CL_SNORM_INT16:
    2893           0 :             (*rowReal) = &(warper->dstRealWork.s[imgNum][memOff]);
    2894           0 :             break;
    2895           0 :         case CL_UNORM_INT16:
    2896           0 :             (*rowReal) = &(warper->dstRealWork.us[imgNum][memOff]);
    2897           0 :             break;
    2898             :     }
    2899             : 
    2900           0 :     if (warper->dstImagWorkCL == nullptr)
    2901             :     {
    2902           0 :         (*rowImag) = nullptr;
    2903             :     }
    2904             :     else
    2905             :     {
    2906           0 :         switch (warper->imageFormat)
    2907             :         {
    2908           0 :             case CL_FLOAT:
    2909           0 :                 (*rowImag) = &(warper->dstImagWork.f[imgNum][memOff]);
    2910           0 :                 break;
    2911           0 :             case CL_SNORM_INT8:
    2912           0 :                 (*rowImag) = &(warper->dstImagWork.c[imgNum][memOff]);
    2913           0 :                 break;
    2914           0 :             case CL_UNORM_INT8:
    2915           0 :                 (*rowImag) = &(warper->dstImagWork.uc[imgNum][memOff]);
    2916           0 :                 break;
    2917           0 :             case CL_SNORM_INT16:
    2918           0 :                 (*rowImag) = &(warper->dstImagWork.s[imgNum][memOff]);
    2919           0 :                 break;
    2920           0 :             case CL_UNORM_INT16:
    2921           0 :                 (*rowImag) = &(warper->dstImagWork.us[imgNum][memOff]);
    2922           0 :                 break;
    2923             :         }
    2924             :     }
    2925             : 
    2926           0 :     return CL_SUCCESS;
    2927             : }
    2928             : 
    2929             : /*
    2930             :  Free the OpenCL warper environment. It should check everything for NULL, so
    2931             :  be sure to mark free()ed pointers as NULL or it'll be double free()ed.
    2932             : 
    2933             :  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
    2934             :  */
    2935           0 : cl_int GDALWarpKernelOpenCL_deleteEnv(struct oclWarper *warper)
    2936             : {
    2937             :     int i;
    2938           0 :     cl_int err = CL_SUCCESS;
    2939             : 
    2940           0 :     for (i = 0; i < warper->numImages; ++i)
    2941             :     {
    2942             :         // Run free!!
    2943           0 :         void *dummy = nullptr;
    2944           0 :         if (warper->realWork.v)
    2945           0 :             freeCLMem(warper->realWorkCL[i], warper->realWork.v[i]);
    2946             :         else
    2947           0 :             freeCLMem(warper->realWorkCL[i], dummy);
    2948           0 :         if (warper->realWork.v)
    2949           0 :             freeCLMem(warper->dstRealWorkCL[i], warper->dstRealWork.v[i]);
    2950             :         else
    2951           0 :             freeCLMem(warper->dstRealWorkCL[i], dummy);
    2952             : 
    2953             :         //(As applicable)
    2954           0 :         if (warper->imagWorkCL != nullptr && warper->imagWork.v != nullptr &&
    2955           0 :             warper->imagWork.v[i] != nullptr)
    2956             :         {
    2957           0 :             freeCLMem(warper->imagWorkCL[i], warper->imagWork.v[i]);
    2958             :         }
    2959           0 :         if (warper->dstImagWorkCL != nullptr &&
    2960           0 :             warper->dstImagWork.v != nullptr &&
    2961           0 :             warper->dstImagWork.v[i] != nullptr)
    2962             :         {
    2963           0 :             freeCLMem(warper->dstImagWorkCL[i], warper->dstImagWork.v[i]);
    2964             :         }
    2965             :     }
    2966             : 
    2967             :     // Free cl_mem
    2968           0 :     freeCLMem(warper->useBandSrcValidCL, warper->useBandSrcValid);
    2969           0 :     freeCLMem(warper->nBandSrcValidCL, warper->nBandSrcValid);
    2970           0 :     freeCLMem(warper->xyWorkCL, warper->xyWork);
    2971           0 :     freeCLMem(warper->fDstNoDataRealCL, warper->fDstNoDataReal);
    2972             : 
    2973             :     // Free pointers to cl_mem*
    2974           0 :     if (warper->realWorkCL != nullptr)
    2975           0 :         CPLFree(warper->realWorkCL);
    2976           0 :     if (warper->dstRealWorkCL != nullptr)
    2977           0 :         CPLFree(warper->dstRealWorkCL);
    2978             : 
    2979           0 :     if (warper->imagWorkCL != nullptr)
    2980           0 :         CPLFree(warper->imagWorkCL);
    2981           0 :     if (warper->dstImagWorkCL != nullptr)
    2982           0 :         CPLFree(warper->dstImagWorkCL);
    2983             : 
    2984           0 :     if (warper->realWork.v != nullptr)
    2985           0 :         CPLFree(warper->realWork.v);
    2986           0 :     if (warper->dstRealWork.v != nullptr)
    2987           0 :         CPLFree(warper->dstRealWork.v);
    2988             : 
    2989           0 :     if (warper->imagWork.v != nullptr)
    2990           0 :         CPLFree(warper->imagWork.v);
    2991           0 :     if (warper->dstImagWork.v != nullptr)
    2992           0 :         CPLFree(warper->dstImagWork.v);
    2993             : 
    2994             :     // Free OpenCL structures
    2995           0 :     if (warper->kern1 != nullptr)
    2996           0 :         clReleaseKernel(warper->kern1);
    2997           0 :     if (warper->kern4 != nullptr)
    2998           0 :         clReleaseKernel(warper->kern4);
    2999           0 :     if (warper->queue != nullptr)
    3000           0 :         clReleaseCommandQueue(warper->queue);
    3001           0 :     if (warper->context != nullptr)
    3002           0 :         clReleaseContext(warper->context);
    3003             : 
    3004           0 :     CPLFree(warper);
    3005             : 
    3006           0 :     return CL_SUCCESS;
    3007             : }
    3008             : 
    3009             : #endif /* defined(HAVE_OPENCL) */

Generated by: LCOV version 1.14