LCOV - code coverage report
Current view: top level - alg - gdalgridsse.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 59 61 96.7 %
Date: 2024-05-04 12:52:34 Functions: 1 1 100.0 %

          Line data    Source code
       1             : /******************************************************************************
       2             :  *
       3             :  * Project:  GDAL Gridding API.
       4             :  * Purpose:  Implementation of GDAL scattered data gridder.
       5             :  * Author:   Even Rouault, <even dot rouault at spatialys.com>
       6             :  *
       7             :  ******************************************************************************
       8             :  * Copyright (c) 2013, Even Rouault <even dot rouault at spatialys.com>
       9             :  *
      10             :  * Permission is hereby granted, free of charge, to any person obtaining a
      11             :  * copy of this software and associated documentation files (the "Software"),
      12             :  * to deal in the Software without restriction, including without limitation
      13             :  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      14             :  * and/or sell copies of the Software, and to permit persons to whom the
      15             :  * Software is furnished to do so, subject to the following conditions:
      16             :  *
      17             :  * The above copyright notice and this permission notice shall be included
      18             :  * in all copies or substantial portions of the Software.
      19             :  *
      20             :  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
      21             :  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
      22             :  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
      23             :  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
      24             :  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
      25             :  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
      26             :  * DEALINGS IN THE SOFTWARE.
      27             :  ****************************************************************************/
      28             : 
      29             : #include "gdalgrid.h"
      30             : #include "gdalgrid_priv.h"
      31             : 
      32             : #ifdef HAVE_SSE_AT_COMPILE_TIME
      33             : #include <xmmintrin.h>
      34             : 
      35             : /************************************************************************/
      36             : /*         GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE()     */
      37             : /************************************************************************/
      38             : 
      39         501 : CPLErr GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE(
      40             :     const void *poOptions, GUInt32 nPoints,
      41             :     CPL_UNUSED const double *unused_padfX,
      42             :     CPL_UNUSED const double *unused_padfY,
      43             :     CPL_UNUSED const double *unused_padfZ, double dfXPoint, double dfYPoint,
      44             :     double *pdfValue, void *hExtraParamsIn)
      45             : {
      46         501 :     size_t i = 0;
      47         501 :     GDALGridExtraParameters *psExtraParams =
      48             :         static_cast<GDALGridExtraParameters *>(hExtraParamsIn);
      49         501 :     const float *pafX = psExtraParams->pafX;
      50         501 :     const float *pafY = psExtraParams->pafY;
      51         501 :     const float *pafZ = psExtraParams->pafZ;
      52             : 
      53         501 :     const float fEpsilon = 0.0000000000001f;
      54         501 :     const float fXPoint = static_cast<float>(dfXPoint);
      55         501 :     const float fYPoint = static_cast<float>(dfYPoint);
      56         501 :     const __m128 xmm_small = _mm_load1_ps(const_cast<float *>(&fEpsilon));
      57         501 :     const __m128 xmm_x = _mm_load1_ps(const_cast<float *>(&fXPoint));
      58         501 :     const __m128 xmm_y = _mm_load1_ps(const_cast<float *>(&fYPoint));
      59         501 :     __m128 xmm_nominator = _mm_setzero_ps();
      60         501 :     __m128 xmm_denominator = _mm_setzero_ps();
      61         501 :     int mask = 0;
      62             : 
      63             : #if defined(__x86_64) || defined(_M_X64)
      64             :     // This would also work in 32bit mode, but there are only 8 XMM registers
      65             :     // whereas we have 16 for 64bit.
      66         501 :     const size_t LOOP_SIZE = 8;
      67         501 :     size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
      68       10301 :     for (i = 0; i < nPointsRound; i += LOOP_SIZE)
      69             :     {
      70             :         // rx = pafX[i] - fXPoint
      71       20400 :         __m128 xmm_rx = _mm_sub_ps(_mm_load_ps(pafX + i), xmm_x);
      72       20400 :         __m128 xmm_rx_4 = _mm_sub_ps(_mm_load_ps(pafX + i + 4), xmm_x);
      73             :         // ry = pafY[i] - fYPoint
      74       20400 :         __m128 xmm_ry = _mm_sub_ps(_mm_load_ps(pafY + i), xmm_y);
      75       30600 :         __m128 xmm_ry_4 = _mm_sub_ps(_mm_load_ps(pafY + i + 4), xmm_y);
      76             :         // r2 = rx * rx + ry * ry
      77             :         __m128 xmm_r2 =
      78       30600 :             _mm_add_ps(_mm_mul_ps(xmm_rx, xmm_rx), _mm_mul_ps(xmm_ry, xmm_ry));
      79       30600 :         __m128 xmm_r2_4 = _mm_add_ps(_mm_mul_ps(xmm_rx_4, xmm_rx_4),
      80             :                                      _mm_mul_ps(xmm_ry_4, xmm_ry_4));
      81             :         // invr2 = 1.0f / r2
      82       10200 :         __m128 xmm_invr2 = _mm_rcp_ps(xmm_r2);
      83       10200 :         __m128 xmm_invr2_4 = _mm_rcp_ps(xmm_r2_4);
      84             :         // nominator += invr2 * pafZ[i]
      85       20400 :         xmm_nominator = _mm_add_ps(
      86       10200 :             xmm_nominator, _mm_mul_ps(xmm_invr2, _mm_load_ps(pafZ + i)));
      87       30600 :         xmm_nominator = _mm_add_ps(
      88       10200 :             xmm_nominator, _mm_mul_ps(xmm_invr2_4, _mm_load_ps(pafZ + i + 4)));
      89             :         // denominator += invr2
      90       10200 :         xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2);
      91       10200 :         xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2_4);
      92             :         // if( r2 < fEpsilon)
      93       20400 :         mask = _mm_movemask_ps(_mm_cmplt_ps(xmm_r2, xmm_small)) |
      94       10200 :                (_mm_movemask_ps(_mm_cmplt_ps(xmm_r2_4, xmm_small)) << 4);
      95       10200 :         if (mask)
      96         400 :             break;
      97             :     }
      98             : #else
      99             : #define LOOP_SIZE 4
     100             :     size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
     101             :     for (i = 0; i < nPointsRound; i += LOOP_SIZE)
     102             :     {
     103             :         __m128 xmm_rx = _mm_sub_ps(_mm_load_ps(pafX + i),
     104             :                                    xmm_x); /* rx = pafX[i] - fXPoint */
     105             :         __m128 xmm_ry = _mm_sub_ps(_mm_load_ps(pafY + i),
     106             :                                    xmm_y); /* ry = pafY[i] - fYPoint */
     107             :         __m128 xmm_r2 =
     108             :             _mm_add_ps(_mm_mul_ps(xmm_rx, xmm_rx), /* r2 = rx * rx + ry * ry */
     109             :                        _mm_mul_ps(xmm_ry, xmm_ry));
     110             :         __m128 xmm_invr2 = _mm_rcp_ps(xmm_r2); /* invr2 = 1.0f / r2 */
     111             :         xmm_nominator =
     112             :             _mm_add_ps(xmm_nominator, /* nominator += invr2 * pafZ[i] */
     113             :                        _mm_mul_ps(xmm_invr2, _mm_load_ps(pafZ + i)));
     114             :         xmm_denominator =
     115             :             _mm_add_ps(xmm_denominator, xmm_invr2); /* denominator += invr2 */
     116             :         mask = _mm_movemask_ps(
     117             :             _mm_cmplt_ps(xmm_r2, xmm_small)); /* if( r2 < fEpsilon) */
     118             :         if (mask)
     119             :             break;
     120             :     }
     121             : #endif
     122             : 
     123             :     // Find which i triggered r2 < fEpsilon.
     124         501 :     if (mask)
     125             :     {
     126        1800 :         for (size_t j = 0; j < LOOP_SIZE; j++)
     127             :         {
     128        1800 :             if (mask & (1 << j))
     129             :             {
     130         400 :                 (*pdfValue) = (pafZ)[i + j];
     131         400 :                 return CE_None;
     132             :             }
     133             :         }
     134             :     }
     135             : 
     136             :     // Get back nominator and denominator values for XMM registers.
     137             :     float afNominator[4];
     138             :     float afDenominator[4];
     139             :     _mm_storeu_ps(afNominator, xmm_nominator);
     140             :     _mm_storeu_ps(afDenominator, xmm_denominator);
     141             : 
     142         101 :     float fNominator =
     143         101 :         afNominator[0] + afNominator[1] + afNominator[2] + afNominator[3];
     144         101 :     float fDenominator = afDenominator[0] + afDenominator[1] +
     145         101 :                          afDenominator[2] + afDenominator[3];
     146             : 
     147             :     /* Do the few remaining loop iterations */
     148         201 :     for (; i < nPoints; i++)
     149             :     {
     150         101 :         const float fRX = pafX[i] - fXPoint;
     151         101 :         const float fRY = pafY[i] - fYPoint;
     152         101 :         const float fR2 = fRX * fRX + fRY * fRY;
     153             : 
     154             :         // If the test point is close to the grid node, use the point
     155             :         // value directly as a node value to avoid singularity.
     156         101 :         if (fR2 < 0.0000000000001)
     157             :         {
     158           1 :             break;
     159             :         }
     160             :         else
     161             :         {
     162         100 :             const float fInvR2 = 1.0f / fR2;
     163         100 :             fNominator += fInvR2 * pafZ[i];
     164         100 :             fDenominator += fInvR2;
     165             :         }
     166             :     }
     167             : 
     168         101 :     if (i != nPoints)
     169             :     {
     170           1 :         (*pdfValue) = pafZ[i];
     171             :     }
     172         100 :     else if (fDenominator == 0.0)
     173             :     {
     174           0 :         (*pdfValue) =
     175             :             static_cast<const GDALGridInverseDistanceToAPowerOptions *>(
     176             :                 poOptions)
     177           0 :                 ->dfNoDataValue;
     178             :     }
     179             :     else
     180             :     {
     181         100 :         (*pdfValue) = fNominator / fDenominator;
     182             :     }
     183             : 
     184         101 :     return CE_None;
     185             : }
     186             : 
     187             : #endif /* HAVE_SSE_AT_COMPILE_TIME */

Generated by: LCOV version 1.14