LCOV - code coverage report
Current view: top level - alg - gdalgridavx.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 63 65 96.9 %
Date: 2025-06-19 12:30:01 Functions: 1 1 100.0 %

          Line data    Source code
       1             : /******************************************************************************
       2             :  *
       3             :  * Project:  GDAL Gridding API.
       4             :  * Purpose:  Implementation of GDAL scattered data gridder.
       5             :  * Author:   Even Rouault, <even dot rouault at spatialys.com>
       6             :  *
       7             :  ******************************************************************************
       8             :  * Copyright (c) 2013, Even Rouault <even dot rouault at spatialys.com>
       9             :  *
      10             :  * SPDX-License-Identifier: MIT
      11             :  ****************************************************************************/
      12             : 
      13             : #include "gdalgrid.h"
      14             : #include "gdalgrid_priv.h"
      15             : 
      16             : #ifdef HAVE_AVX_AT_COMPILE_TIME
      17             : #include <immintrin.h>
      18             : 
      19             : /************************************************************************/
      20             : /*         GDALGridInverseDistanceToAPower2NoSmoothingNoSearchAVX()     */
      21             : /************************************************************************/
      22             : 
      23     1221500 : CPLErr GDALGridInverseDistanceToAPower2NoSmoothingNoSearchAVX(
      24             :     const void *poOptions, GUInt32 nPoints,
      25             :     CPL_UNUSED const double *unused_padfX,
      26             :     CPL_UNUSED const double *unused_padfY,
      27             :     CPL_UNUSED const double *unused_padfZ, double dfXPoint, double dfYPoint,
      28             :     double *pdfValue, void *hExtraParamsIn)
      29             : {
      30     1221500 :     size_t i = 0;
      31     1221500 :     GDALGridExtraParameters *psExtraParams =
      32             :         static_cast<GDALGridExtraParameters *>(hExtraParamsIn);
      33     1221500 :     const float *pafX = psExtraParams->pafX;
      34     1221500 :     const float *pafY = psExtraParams->pafY;
      35     1221500 :     const float *pafZ = psExtraParams->pafZ;
      36             : 
      37     1221500 :     const float fEpsilon = 0.0000000000001f;
      38     1221500 :     const float fXPoint = static_cast<float>(dfXPoint);
      39     1221500 :     const float fYPoint = static_cast<float>(dfYPoint);
      40     1221500 :     const __m256 ymm_small = _mm256_set1_ps(fEpsilon);
      41     1221500 :     const __m256 ymm_x = _mm256_set1_ps(fXPoint);
      42     1221500 :     const __m256 ymm_y = _mm256_set1_ps(fYPoint);
      43     1221500 :     __m256 ymm_nominator = _mm256_setzero_ps();
      44     1221500 :     __m256 ymm_denominator = _mm256_setzero_ps();
      45     1221500 :     int mask = 0;
      46             : 
      47             : #undef LOOP_SIZE
      48             : #if defined(__x86_64) || defined(_M_X64)
      49             :     /* This would also work in 32bit mode, but there are only 8 XMM registers */
      50             :     /* whereas we have 16 for 64bit */
      51             : #define LOOP_SIZE 16
      52     1221500 :     size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
      53     1235570 :     for (i = 0; i < nPointsRound; i += LOOP_SIZE)
      54             :     {
      55       30568 :         __m256 ymm_rx = _mm256_sub_ps(_mm256_load_ps(pafX + i),
      56             :                                       ymm_x); /* rx = pafX[i] - fXPoint */
      57       30568 :         __m256 ymm_rx_8 = _mm256_sub_ps(_mm256_load_ps(pafX + i + 8), ymm_x);
      58       30568 :         __m256 ymm_ry = _mm256_sub_ps(_mm256_load_ps(pafY + i),
      59             :                                       ymm_y); /* ry = pafY[i] - fYPoint */
      60       45852 :         __m256 ymm_ry_8 = _mm256_sub_ps(_mm256_load_ps(pafY + i + 8), ymm_y);
      61       45852 :         __m256 ymm_r2 = _mm256_add_ps(
      62             :             _mm256_mul_ps(ymm_rx, ymm_rx), /* r2 = rx * rx + ry * ry */
      63             :             _mm256_mul_ps(ymm_ry, ymm_ry));
      64       45852 :         __m256 ymm_r2_8 = _mm256_add_ps(_mm256_mul_ps(ymm_rx_8, ymm_rx_8),
      65             :                                         _mm256_mul_ps(ymm_ry_8, ymm_ry_8));
      66       15284 :         __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2); /* invr2 = 1.0f / r2 */
      67       15284 :         __m256 ymm_invr2_8 = _mm256_rcp_ps(ymm_r2_8);
      68             :         ymm_nominator =
      69       30568 :             _mm256_add_ps(ymm_nominator, /* nominator += invr2 * pafZ[i] */
      70       15284 :                           _mm256_mul_ps(ymm_invr2, _mm256_load_ps(pafZ + i)));
      71       45852 :         ymm_nominator = _mm256_add_ps(
      72             :             ymm_nominator,
      73       15284 :             _mm256_mul_ps(ymm_invr2_8, _mm256_load_ps(pafZ + i + 8)));
      74       15284 :         ymm_denominator = _mm256_add_ps(ymm_denominator,
      75             :                                         ymm_invr2); /* denominator += invr2 */
      76       15284 :         ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2_8);
      77       15271 :         mask =
      78       15284 :             _mm256_movemask_ps(_mm256_cmp_ps(
      79             :                 ymm_r2, ymm_small, _CMP_LT_OS)) | /* if( r2 < fEpsilon) */
      80       15282 :             (_mm256_movemask_ps(_mm256_cmp_ps(ymm_r2_8, ymm_small, _CMP_LT_OS))
      81       15271 :              << 8);
      82       15271 :         if (mask)
      83        1199 :             break;
      84             :     }
      85             : #else
      86             : #define LOOP_SIZE 8
      87             :     size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
      88             :     for (i = 0; i < nPointsRound; i += LOOP_SIZE)
      89             :     {
      90             :         __m256 ymm_rx =
      91             :             _mm256_sub_ps(_mm256_load_ps(const_cast<float *>(pafX) + i),
      92             :                           ymm_x); /* rx = pafX[i] - fXPoint */
      93             :         __m256 ymm_ry =
      94             :             _mm256_sub_ps(_mm256_load_ps(const_cast<float *>(pafY) + i),
      95             :                           ymm_y); /* ry = pafY[i] - fYPoint */
      96             :         __m256 ymm_r2 = _mm256_add_ps(
      97             :             _mm256_mul_ps(ymm_rx, ymm_rx), /* r2 = rx * rx + ry * ry */
      98             :             _mm256_mul_ps(ymm_ry, ymm_ry));
      99             :         __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2); /* invr2 = 1.0f / r2 */
     100             :         ymm_nominator = _mm256_add_ps(
     101             :             ymm_nominator, /* nominator += invr2 * pafZ[i] */
     102             :             _mm256_mul_ps(ymm_invr2,
     103             :                           _mm256_load_ps(const_cast<float *>(pafZ) + i)));
     104             :         ymm_denominator = _mm256_add_ps(ymm_denominator,
     105             :                                         ymm_invr2); /* denominator += invr2 */
     106             :         mask = _mm256_movemask_ps(_mm256_cmp_ps(
     107             :             ymm_r2, ymm_small, _CMP_LT_OS)); /* if( r2 < fEpsilon) */
     108             :         if (mask)
     109             :             break;
     110             :     }
     111             : #endif
     112             : 
     113             :     // Find which i triggered r2 < fEpsilon.
     114     1221490 :     if (mask)
     115             :     {
     116       10151 :         for (int j = 0; j < LOOP_SIZE; j++)
     117             :         {
     118       10148 :             if (mask & (1 << j))
     119             :             {
     120        1196 :                 (*pdfValue) = (pafZ)[i + j];
     121             : 
     122        1196 :                 return CE_None;
     123             :             }
     124             :         }
     125             :     }
     126             : #undef LOOP_SIZE
     127             : 
     128             :     // Get back nominator and denominator values for YMM registers.
     129             :     float afNominator[8];
     130             :     float afDenominator[8];
     131             :     _mm256_storeu_ps(afNominator, ymm_nominator);
     132             :     _mm256_storeu_ps(afDenominator, ymm_denominator);
     133             : 
     134             :     // MSVC doesn't emit AVX afterwards but may use SSE, so clear
     135             :     // upper bits.  Other compilers will continue using AVX for the
     136             :     // below floating points operations.
     137             : #if defined(_MSC_FULL_VER)
     138             :     _mm256_zeroupper();
     139             : #endif
     140             : 
     141     1220290 :     float fNominator = afNominator[0] + afNominator[1] + afNominator[2] +
     142     1220290 :                        afNominator[3] + afNominator[4] + afNominator[5] +
     143     1220290 :                        afNominator[6] + afNominator[7];
     144     1220290 :     float fDenominator = afDenominator[0] + afDenominator[1] +
     145     1220290 :                          afDenominator[2] + afDenominator[3] +
     146     1220290 :                          afDenominator[4] + afDenominator[5] +
     147     1220290 :                          afDenominator[6] + afDenominator[7];
     148             : 
     149             :     // Do the few remaining loop iterations.
     150     6703850 :     for (; i < nPoints; i++)
     151             :     {
     152     5483580 :         const float fRX = pafX[i] - fXPoint;
     153     5483580 :         const float fRY = pafY[i] - fYPoint;
     154     5483580 :         const float fR2 = fRX * fRX + fRY * fRY;
     155             : 
     156             :         // If the test point is close to the grid node, use the point
     157             :         // value directly as a node value to avoid singularity.
     158     5483580 :         if (fR2 < 0.0000000000001)
     159             :         {
     160          15 :             break;
     161             :         }
     162             :         else
     163             :         {
     164     5483560 :             const float fInvR2 = 1.0f / fR2;
     165     5483560 :             fNominator += fInvR2 * pafZ[i];
     166     5483560 :             fDenominator += fInvR2;
     167             :         }
     168             :     }
     169             : 
     170     1220290 :     if (i != nPoints)
     171             :     {
     172          15 :         (*pdfValue) = pafZ[i];
     173             :     }
     174     1220280 :     else if (fDenominator == 0.0)
     175             :     {
     176           0 :         (*pdfValue) =
     177             :             static_cast<const GDALGridInverseDistanceToAPowerOptions *>(
     178             :                 poOptions)
     179           0 :                 ->dfNoDataValue;
     180             :     }
     181             :     else
     182     1220280 :         (*pdfValue) = fNominator / fDenominator;
     183             : 
     184     1220290 :     return CE_None;
     185             : }
     186             : 
     187             : #endif /* HAVE_AVX_AT_COMPILE_TIME */

Generated by: LCOV version 1.14