LCOV - code coverage report
Current view: top level - alg - gdalgridavx.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 53 65 81.5 %
Date: 2024-11-21 22:18:42 Functions: 1 1 100.0 %

          Line data    Source code
       1             : /******************************************************************************
       2             :  *
       3             :  * Project:  GDAL Gridding API.
       4             :  * Purpose:  Implementation of GDAL scattered data gridder.
       5             :  * Author:   Even Rouault, <even dot rouault at spatialys.com>
       6             :  *
       7             :  ******************************************************************************
       8             :  * Copyright (c) 2013, Even Rouault <even dot rouault at spatialys.com>
       9             :  *
      10             :  * SPDX-License-Identifier: MIT
      11             :  ****************************************************************************/
      12             : 
      13             : #include "gdalgrid.h"
      14             : #include "gdalgrid_priv.h"
      15             : 
      16             : #ifdef HAVE_AVX_AT_COMPILE_TIME
      17             : #include <immintrin.h>
      18             : 
      19             : /************************************************************************/
      20             : /*         GDALGridInverseDistanceToAPower2NoSmoothingNoSearchAVX()     */
      21             : /************************************************************************/
      22             : 
      23             : #define GDAL_mm256_load1_ps(x) _mm256_set_ps(x, x, x, x, x, x, x, x)
      24             : 
      25        1323 : CPLErr GDALGridInverseDistanceToAPower2NoSmoothingNoSearchAVX(
      26             :     const void *poOptions, GUInt32 nPoints,
      27             :     CPL_UNUSED const double *unused_padfX,
      28             :     CPL_UNUSED const double *unused_padfY,
      29             :     CPL_UNUSED const double *unused_padfZ, double dfXPoint, double dfYPoint,
      30             :     double *pdfValue, void *hExtraParamsIn)
      31             : {
      32        1323 :     size_t i = 0;
      33        1323 :     GDALGridExtraParameters *psExtraParams =
      34             :         static_cast<GDALGridExtraParameters *>(hExtraParamsIn);
      35        1323 :     const float *pafX = psExtraParams->pafX;
      36        1323 :     const float *pafY = psExtraParams->pafY;
      37        1323 :     const float *pafZ = psExtraParams->pafZ;
      38             : 
      39        1323 :     const float fEpsilon = 0.0000000000001f;
      40        1323 :     const float fXPoint = static_cast<float>(dfXPoint);
      41        1323 :     const float fYPoint = static_cast<float>(dfYPoint);
      42        1323 :     const __m256 ymm_small = GDAL_mm256_load1_ps(fEpsilon);
      43        1323 :     const __m256 ymm_x = GDAL_mm256_load1_ps(fXPoint);
      44        1323 :     const __m256 ymm_y = GDAL_mm256_load1_ps(fYPoint);
      45        1323 :     __m256 ymm_nominator = _mm256_setzero_ps();
      46        1323 :     __m256 ymm_denominator = _mm256_setzero_ps();
      47        1323 :     int mask = 0;
      48             : 
      49             : #undef LOOP_SIZE
      50             : #if defined(__x86_64) || defined(_M_X64)
      51             :     /* This would also work in 32bit mode, but there are only 8 XMM registers */
      52             :     /* whereas we have 16 for 64bit */
      53             : #define LOOP_SIZE 16
      54        1323 :     size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
      55       14554 :     for (i = 0; i < nPointsRound; i += LOOP_SIZE)
      56             :     {
      57       29166 :         __m256 ymm_rx = _mm256_sub_ps(_mm256_load_ps(pafX + i),
      58             :                                       ymm_x); /* rx = pafX[i] - fXPoint */
      59       29166 :         __m256 ymm_rx_8 = _mm256_sub_ps(_mm256_load_ps(pafX + i + 8), ymm_x);
      60       29166 :         __m256 ymm_ry = _mm256_sub_ps(_mm256_load_ps(pafY + i),
      61             :                                       ymm_y); /* ry = pafY[i] - fYPoint */
      62       43749 :         __m256 ymm_ry_8 = _mm256_sub_ps(_mm256_load_ps(pafY + i + 8), ymm_y);
      63       43749 :         __m256 ymm_r2 = _mm256_add_ps(
      64             :             _mm256_mul_ps(ymm_rx, ymm_rx), /* r2 = rx * rx + ry * ry */
      65             :             _mm256_mul_ps(ymm_ry, ymm_ry));
      66       43749 :         __m256 ymm_r2_8 = _mm256_add_ps(_mm256_mul_ps(ymm_rx_8, ymm_rx_8),
      67             :                                         _mm256_mul_ps(ymm_ry_8, ymm_ry_8));
      68       14583 :         __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2); /* invr2 = 1.0f / r2 */
      69       14583 :         __m256 ymm_invr2_8 = _mm256_rcp_ps(ymm_r2_8);
      70             :         ymm_nominator =
      71       29166 :             _mm256_add_ps(ymm_nominator, /* nominator += invr2 * pafZ[i] */
      72       14583 :                           _mm256_mul_ps(ymm_invr2, _mm256_load_ps(pafZ + i)));
      73       43749 :         ymm_nominator = _mm256_add_ps(
      74             :             ymm_nominator,
      75       14583 :             _mm256_mul_ps(ymm_invr2_8, _mm256_load_ps(pafZ + i + 8)));
      76       14583 :         ymm_denominator = _mm256_add_ps(ymm_denominator,
      77             :                                         ymm_invr2); /* denominator += invr2 */
      78       14583 :         ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2_8);
      79       14431 :         mask =
      80       14583 :             _mm256_movemask_ps(_mm256_cmp_ps(
      81             :                 ymm_r2, ymm_small, _CMP_LT_OS)) | /* if( r2 < fEpsilon) */
      82       14400 :             (_mm256_movemask_ps(_mm256_cmp_ps(ymm_r2_8, ymm_small, _CMP_LT_OS))
      83       14431 :              << 8);
      84       14431 :         if (mask)
      85        1200 :             break;
      86             :     }
      87             : #else
      88             : #define LOOP_SIZE 8
      89             :     size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
      90             :     for (i = 0; i < nPointsRound; i += LOOP_SIZE)
      91             :     {
      92             :         __m256 ymm_rx =
      93             :             _mm256_sub_ps(_mm256_load_ps(const_cast<float *>(pafX) + i),
      94             :                           ymm_x); /* rx = pafX[i] - fXPoint */
      95             :         __m256 ymm_ry =
      96             :             _mm256_sub_ps(_mm256_load_ps(const_cast<float *>(pafY) + i),
      97             :                           ymm_y); /* ry = pafY[i] - fYPoint */
      98             :         __m256 ymm_r2 = _mm256_add_ps(
      99             :             _mm256_mul_ps(ymm_rx, ymm_rx), /* r2 = rx * rx + ry * ry */
     100             :             _mm256_mul_ps(ymm_ry, ymm_ry));
     101             :         __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2); /* invr2 = 1.0f / r2 */
     102             :         ymm_nominator = _mm256_add_ps(
     103             :             ymm_nominator, /* nominator += invr2 * pafZ[i] */
     104             :             _mm256_mul_ps(ymm_invr2,
     105             :                           _mm256_load_ps(const_cast<float *>(pafZ) + i)));
     106             :         ymm_denominator = _mm256_add_ps(ymm_denominator,
     107             :                                         ymm_invr2); /* denominator += invr2 */
     108             :         mask = _mm256_movemask_ps(_mm256_cmp_ps(
     109             :             ymm_r2, ymm_small, _CMP_LT_OS)); /* if( r2 < fEpsilon) */
     110             :         if (mask)
     111             :             break;
     112             :     }
     113             : #endif
     114             : 
     115             :     // Find which i triggered r2 < fEpsilon.
     116        1171 :     if (mask)
     117             :     {
     118       10133 :         for (int j = 0; j < LOOP_SIZE; j++)
     119             :         {
     120       10132 :             if (mask & (1 << j))
     121             :             {
     122        1199 :                 (*pdfValue) = (pafZ)[i + j];
     123             : 
     124             :                 // GCC and MSVC need explicit zeroing.
     125             : #if !defined(__clang__)
     126             :                 _mm256_zeroupper();
     127             : #endif
     128        1200 :                 return CE_None;
     129             :             }
     130             :         }
     131             :     }
     132             : #undef LOOP_SIZE
     133             : 
     134             :     // Get back nominator and denominator values for YMM registers.
     135             :     float afNominator[8];
     136             :     float afDenominator[8];
     137             :     _mm256_storeu_ps(afNominator, ymm_nominator);
     138             :     _mm256_storeu_ps(afDenominator, ymm_denominator);
     139             : 
     140             :     // MSVC doesn't emit AVX afterwards but may use SSE, so clear
     141             :     // upper bits.  Other compilers will continue using AVX for the
     142             :     // below floating points operations.
     143             : #if defined(_MSC_FULL_VER)
     144             :     _mm256_zeroupper();
     145             : #endif
     146             : 
     147           0 :     float fNominator = afNominator[0] + afNominator[1] + afNominator[2] +
     148           0 :                        afNominator[3] + afNominator[4] + afNominator[5] +
     149           0 :                        afNominator[6] + afNominator[7];
     150           0 :     float fDenominator = afDenominator[0] + afDenominator[1] +
     151           0 :                          afDenominator[2] + afDenominator[3] +
     152           0 :                          afDenominator[4] + afDenominator[5] +
     153           0 :                          afDenominator[6] + afDenominator[7];
     154             : 
     155             :     // Do the few remaining loop iterations.
     156         139 :     for (; i < nPoints; i++)
     157             :     {
     158         182 :         const float fRX = pafX[i] - fXPoint;
     159         182 :         const float fRY = pafY[i] - fYPoint;
     160         182 :         const float fR2 = fRX * fRX + fRY * fRY;
     161             : 
     162             :         // If the test point is close to the grid node, use the point
     163             :         // value directly as a node value to avoid singularity.
     164         182 :         if (fR2 < 0.0000000000001)
     165             :         {
     166          15 :             break;
     167             :         }
     168             :         else
     169             :         {
     170         167 :             const float fInvR2 = 1.0f / fR2;
     171         167 :             fNominator += fInvR2 * pafZ[i];
     172         167 :             fDenominator += fInvR2;
     173             :         }
     174             :     }
     175             : 
     176           0 :     if (i != nPoints)
     177             :     {
     178          15 :         (*pdfValue) = pafZ[i];
     179             :     }
     180           0 :     else if (fDenominator == 0.0)
     181             :     {
     182           0 :         (*pdfValue) =
     183             :             static_cast<const GDALGridInverseDistanceToAPowerOptions *>(
     184             :                 poOptions)
     185           0 :                 ->dfNoDataValue;
     186             :     }
     187             :     else
     188           0 :         (*pdfValue) = fNominator / fDenominator;
     189             : 
     190             :         // GCC needs explicit zeroing.
     191             : #if defined(__GNUC__) && !defined(__clang__)
     192             :     _mm256_zeroupper();
     193             : #endif
     194             : 
     195         125 :     return CE_None;
     196             : }
     197             : 
     198             : #endif /* HAVE_AVX_AT_COMPILE_TIME */

Generated by: LCOV version 1.14