LCOV - code coverage report
Current view: top level - alg - gdalgridavx.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 63 65 96.9 %
Date: 2024-05-04 12:52:34 Functions: 1 1 100.0 %

          Line data    Source code
       1             : /******************************************************************************
       2             :  *
       3             :  * Project:  GDAL Gridding API.
       4             :  * Purpose:  Implementation of GDAL scattered data gridder.
       5             :  * Author:   Even Rouault, <even dot rouault at spatialys.com>
       6             :  *
       7             :  ******************************************************************************
       8             :  * Copyright (c) 2013, Even Rouault <even dot rouault at spatialys.com>
       9             :  *
      10             :  * Permission is hereby granted, free of charge, to any person obtaining a
      11             :  * copy of this software and associated documentation files (the "Software"),
      12             :  * to deal in the Software without restriction, including without limitation
      13             :  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      14             :  * and/or sell copies of the Software, and to permit persons to whom the
      15             :  * Software is furnished to do so, subject to the following conditions:
      16             :  *
      17             :  * The above copyright notice and this permission notice shall be included
      18             :  * in all copies or substantial portions of the Software.
      19             :  *
      20             :  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
      21             :  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
      22             :  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
      23             :  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
      24             :  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
      25             :  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
      26             :  * DEALINGS IN THE SOFTWARE.
      27             :  ****************************************************************************/
      28             : 
      29             : #include "gdalgrid.h"
      30             : #include "gdalgrid_priv.h"
      31             : 
      32             : #ifdef HAVE_AVX_AT_COMPILE_TIME
      33             : #include <immintrin.h>
      34             : 
      35             : /************************************************************************/
      36             : /*         GDALGridInverseDistanceToAPower2NoSmoothingNoSearchAVX()     */
      37             : /************************************************************************/
      38             : 
      39             : #define GDAL_mm256_load1_ps(x) _mm256_set_ps(x, x, x, x, x, x, x, x)
      40             : 
      41        1320 : CPLErr GDALGridInverseDistanceToAPower2NoSmoothingNoSearchAVX(
      42             :     const void *poOptions, GUInt32 nPoints,
      43             :     CPL_UNUSED const double *unused_padfX,
      44             :     CPL_UNUSED const double *unused_padfY,
      45             :     CPL_UNUSED const double *unused_padfZ, double dfXPoint, double dfYPoint,
      46             :     double *pdfValue, void *hExtraParamsIn)
      47             : {
      48        1320 :     size_t i = 0;
      49        1320 :     GDALGridExtraParameters *psExtraParams =
      50             :         static_cast<GDALGridExtraParameters *>(hExtraParamsIn);
      51        1320 :     const float *pafX = psExtraParams->pafX;
      52        1320 :     const float *pafY = psExtraParams->pafY;
      53        1320 :     const float *pafZ = psExtraParams->pafZ;
      54             : 
      55        1320 :     const float fEpsilon = 0.0000000000001f;
      56        1320 :     const float fXPoint = static_cast<float>(dfXPoint);
      57        1320 :     const float fYPoint = static_cast<float>(dfYPoint);
      58        1320 :     const __m256 ymm_small = GDAL_mm256_load1_ps(fEpsilon);
      59        1320 :     const __m256 ymm_x = GDAL_mm256_load1_ps(fXPoint);
      60        1320 :     const __m256 ymm_y = GDAL_mm256_load1_ps(fYPoint);
      61        1320 :     __m256 ymm_nominator = _mm256_setzero_ps();
      62        1320 :     __m256 ymm_denominator = _mm256_setzero_ps();
      63        1320 :     int mask = 0;
      64             : 
      65             : #undef LOOP_SIZE
      66             : #if defined(__x86_64) || defined(_M_X64)
      67             :     /* This would also work in 32bit mode, but there are only 8 XMM registers */
      68             :     /* whereas we have 16 for 64bit */
      69             : #define LOOP_SIZE 16
      70        1320 :     size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
      71       15715 :     for (i = 0; i < nPointsRound; i += LOOP_SIZE)
      72             :     {
      73       31102 :         __m256 ymm_rx = _mm256_sub_ps(_mm256_load_ps(pafX + i),
      74             :                                       ymm_x); /* rx = pafX[i] - fXPoint */
      75       31102 :         __m256 ymm_rx_8 = _mm256_sub_ps(_mm256_load_ps(pafX + i + 8), ymm_x);
      76       31102 :         __m256 ymm_ry = _mm256_sub_ps(_mm256_load_ps(pafY + i),
      77             :                                       ymm_y); /* ry = pafY[i] - fYPoint */
      78       46653 :         __m256 ymm_ry_8 = _mm256_sub_ps(_mm256_load_ps(pafY + i + 8), ymm_y);
      79       46653 :         __m256 ymm_r2 = _mm256_add_ps(
      80             :             _mm256_mul_ps(ymm_rx, ymm_rx), /* r2 = rx * rx + ry * ry */
      81             :             _mm256_mul_ps(ymm_ry, ymm_ry));
      82       46653 :         __m256 ymm_r2_8 = _mm256_add_ps(_mm256_mul_ps(ymm_rx_8, ymm_rx_8),
      83             :                                         _mm256_mul_ps(ymm_ry_8, ymm_ry_8));
      84       15551 :         __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2); /* invr2 = 1.0f / r2 */
      85       15551 :         __m256 ymm_invr2_8 = _mm256_rcp_ps(ymm_r2_8);
      86             :         ymm_nominator =
      87       31102 :             _mm256_add_ps(ymm_nominator, /* nominator += invr2 * pafZ[i] */
      88       15551 :                           _mm256_mul_ps(ymm_invr2, _mm256_load_ps(pafZ + i)));
      89       46653 :         ymm_nominator = _mm256_add_ps(
      90             :             ymm_nominator,
      91       15551 :             _mm256_mul_ps(ymm_invr2_8, _mm256_load_ps(pafZ + i + 8)));
      92       15551 :         ymm_denominator = _mm256_add_ps(ymm_denominator,
      93             :                                         ymm_invr2); /* denominator += invr2 */
      94       15551 :         ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2_8);
      95       15595 :         mask =
      96       15551 :             _mm256_movemask_ps(_mm256_cmp_ps(
      97             :                 ymm_r2, ymm_small, _CMP_LT_OS)) | /* if( r2 < fEpsilon) */
      98       15587 :             (_mm256_movemask_ps(_mm256_cmp_ps(ymm_r2_8, ymm_small, _CMP_LT_OS))
      99       15595 :              << 8);
     100       15595 :         if (mask)
     101        1200 :             break;
     102             :     }
     103             : #else
     104             : #define LOOP_SIZE 8
     105             :     size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
     106             :     for (i = 0; i < nPointsRound; i += LOOP_SIZE)
     107             :     {
     108             :         __m256 ymm_rx =
     109             :             _mm256_sub_ps(_mm256_load_ps(const_cast<float *>(pafX) + i),
     110             :                           ymm_x); /* rx = pafX[i] - fXPoint */
     111             :         __m256 ymm_ry =
     112             :             _mm256_sub_ps(_mm256_load_ps(const_cast<float *>(pafY) + i),
     113             :                           ymm_y); /* ry = pafY[i] - fYPoint */
     114             :         __m256 ymm_r2 = _mm256_add_ps(
     115             :             _mm256_mul_ps(ymm_rx, ymm_rx), /* r2 = rx * rx + ry * ry */
     116             :             _mm256_mul_ps(ymm_ry, ymm_ry));
     117             :         __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2); /* invr2 = 1.0f / r2 */
     118             :         ymm_nominator = _mm256_add_ps(
     119             :             ymm_nominator, /* nominator += invr2 * pafZ[i] */
     120             :             _mm256_mul_ps(ymm_invr2,
     121             :                           _mm256_load_ps(const_cast<float *>(pafZ) + i)));
     122             :         ymm_denominator = _mm256_add_ps(ymm_denominator,
     123             :                                         ymm_invr2); /* denominator += invr2 */
     124             :         mask = _mm256_movemask_ps(_mm256_cmp_ps(
     125             :             ymm_r2, ymm_small, _CMP_LT_OS)); /* if( r2 < fEpsilon) */
     126             :         if (mask)
     127             :             break;
     128             :     }
     129             : #endif
     130             : 
     131             :     // Find which i triggered r2 < fEpsilon.
     132        1364 :     if (mask)
     133             :     {
     134       10082 :         for (int j = 0; j < LOOP_SIZE; j++)
     135             :         {
     136       10079 :             if (mask & (1 << j))
     137             :             {
     138        1196 :                 (*pdfValue) = (pafZ)[i + j];
     139             : 
     140             :                 // GCC and MSVC need explicit zeroing.
     141             : #if !defined(__clang__)
     142             :                 _mm256_zeroupper();
     143             : #endif
     144        1195 :                 return CE_None;
     145             :             }
     146             :         }
     147             :     }
     148             : #undef LOOP_SIZE
     149             : 
     150             :     // Get back nominator and denominator values for YMM registers.
     151             :     float afNominator[8];
     152             :     float afDenominator[8];
     153             :     _mm256_storeu_ps(afNominator, ymm_nominator);
     154             :     _mm256_storeu_ps(afDenominator, ymm_denominator);
     155             : 
     156             :     // MSVC doesn't emit AVX afterwards but may use SSE, so clear
     157             :     // upper bits.  Other compilers will continue using AVX for the
     158             :     // below floating points operations.
     159             : #if defined(_MSC_FULL_VER)
     160             :     _mm256_zeroupper();
     161             : #endif
     162             : 
     163         168 :     float fNominator = afNominator[0] + afNominator[1] + afNominator[2] +
     164         168 :                        afNominator[3] + afNominator[4] + afNominator[5] +
     165         168 :                        afNominator[6] + afNominator[7];
     166         168 :     float fDenominator = afDenominator[0] + afDenominator[1] +
     167         168 :                          afDenominator[2] + afDenominator[3] +
     168         168 :                          afDenominator[4] + afDenominator[5] +
     169         168 :                          afDenominator[6] + afDenominator[7];
     170             : 
     171             :     // Do the few remaining loop iterations.
     172         328 :     for (; i < nPoints; i++)
     173             :     {
     174         175 :         const float fRX = pafX[i] - fXPoint;
     175         175 :         const float fRY = pafY[i] - fYPoint;
     176         175 :         const float fR2 = fRX * fRX + fRY * fRY;
     177             : 
     178             :         // If the test point is close to the grid node, use the point
     179             :         // value directly as a node value to avoid singularity.
     180         175 :         if (fR2 < 0.0000000000001)
     181             :         {
     182          15 :             break;
     183             :         }
     184             :         else
     185             :         {
     186         160 :             const float fInvR2 = 1.0f / fR2;
     187         160 :             fNominator += fInvR2 * pafZ[i];
     188         160 :             fDenominator += fInvR2;
     189             :         }
     190             :     }
     191             : 
     192         168 :     if (i != nPoints)
     193             :     {
     194          15 :         (*pdfValue) = pafZ[i];
     195             :     }
     196         153 :     else if (fDenominator == 0.0)
     197             :     {
     198           0 :         (*pdfValue) =
     199             :             static_cast<const GDALGridInverseDistanceToAPowerOptions *>(
     200             :                 poOptions)
     201           0 :                 ->dfNoDataValue;
     202             :     }
     203             :     else
     204         153 :         (*pdfValue) = fNominator / fDenominator;
     205             : 
     206             :         // GCC needs explicit zeroing.
     207             : #if defined(__GNUC__) && !defined(__clang__)
     208             :     _mm256_zeroupper();
     209             : #endif
     210             : 
     211         125 :     return CE_None;
     212             : }
     213             : 
     214             : #endif /* HAVE_AVX_AT_COMPILE_TIME */

Generated by: LCOV version 1.14