Line data Source code
1 : /****************************************************************************** 2 : * 3 : * Project: GDAL Gridding API. 4 : * Purpose: Implementation of GDAL scattered data gridder. 5 : * Author: Even Rouault, <even dot rouault at spatialys.com> 6 : * 7 : ****************************************************************************** 8 : * Copyright (c) 2013, Even Rouault <even dot rouault at spatialys.com> 9 : * 10 : * SPDX-License-Identifier: MIT 11 : ****************************************************************************/ 12 : 13 : #include "gdalgrid.h" 14 : #include "gdalgrid_priv.h" 15 : 16 : #ifdef HAVE_SSE_AT_COMPILE_TIME 17 : 18 : #ifdef USE_NEON_OPTIMIZATIONS 19 : #include "include_sse2neon.h" 20 : #else 21 : #include <xmmintrin.h> 22 : #endif 23 : 24 : /************************************************************************/ 25 : /* GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE() */ 26 : /************************************************************************/ 27 : 28 494 : CPLErr GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE( 29 : const void *poOptions, GUInt32 nPoints, 30 : CPL_UNUSED const double *unused_padfX, 31 : CPL_UNUSED const double *unused_padfY, 32 : CPL_UNUSED const double *unused_padfZ, double dfXPoint, double dfYPoint, 33 : double *pdfValue, void *hExtraParamsIn) 34 : { 35 494 : size_t i = 0; 36 494 : GDALGridExtraParameters *psExtraParams = 37 : static_cast<GDALGridExtraParameters *>(hExtraParamsIn); 38 494 : const float *pafX = psExtraParams->pafX; 39 494 : const float *pafY = psExtraParams->pafY; 40 494 : const float *pafZ = psExtraParams->pafZ; 41 : 42 494 : const float fEpsilon = 0.0000000000001f; 43 494 : const float fXPoint = static_cast<float>(dfXPoint); 44 494 : const float fYPoint = static_cast<float>(dfYPoint); 45 494 : const __m128 xmm_small = _mm_load1_ps(const_cast<float *>(&fEpsilon)); 46 494 : const __m128 xmm_x = _mm_load1_ps(const_cast<float *>(&fXPoint)); 47 494 : const __m128 xmm_y = _mm_load1_ps(const_cast<float *>(&fYPoint)); 48 494 : __m128 xmm_nominator = _mm_setzero_ps(); 49 494 : __m128 xmm_denominator = _mm_setzero_ps(); 50 494 : int mask = 0; 51 : 52 : #if defined(__x86_64) || defined(_M_X64) || defined(USE_NEON_OPTIMIZATIONS) 53 : // This would also work in 32bit mode, but there are only 8 XMM registers 54 : // whereas we have 16 for 64bit. 55 494 : const size_t LOOP_SIZE = 8; 56 494 : size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE; 57 9548 : for (i = 0; i < nPointsRound; i += LOOP_SIZE) 58 : { 59 : // rx = pafX[i] - fXPoint 60 18906 : __m128 xmm_rx = _mm_sub_ps(_mm_load_ps(pafX + i), xmm_x); 61 18906 : __m128 xmm_rx_4 = _mm_sub_ps(_mm_load_ps(pafX + i + 4), xmm_x); 62 : // ry = pafY[i] - fYPoint 63 18906 : __m128 xmm_ry = _mm_sub_ps(_mm_load_ps(pafY + i), xmm_y); 64 28359 : __m128 xmm_ry_4 = _mm_sub_ps(_mm_load_ps(pafY + i + 4), xmm_y); 65 : // r2 = rx * rx + ry * ry 66 : __m128 xmm_r2 = 67 28359 : _mm_add_ps(_mm_mul_ps(xmm_rx, xmm_rx), _mm_mul_ps(xmm_ry, xmm_ry)); 68 28359 : __m128 xmm_r2_4 = _mm_add_ps(_mm_mul_ps(xmm_rx_4, xmm_rx_4), 69 : _mm_mul_ps(xmm_ry_4, xmm_ry_4)); 70 : // invr2 = 1.0f / r2 71 9453 : __m128 xmm_invr2 = _mm_rcp_ps(xmm_r2); 72 9453 : __m128 xmm_invr2_4 = _mm_rcp_ps(xmm_r2_4); 73 : // nominator += invr2 * pafZ[i] 74 18906 : xmm_nominator = _mm_add_ps( 75 9453 : xmm_nominator, _mm_mul_ps(xmm_invr2, _mm_load_ps(pafZ + i))); 76 28359 : xmm_nominator = _mm_add_ps( 77 9453 : xmm_nominator, _mm_mul_ps(xmm_invr2_4, _mm_load_ps(pafZ + i + 4))); 78 : // denominator += invr2 79 9453 : xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2); 80 9453 : xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2_4); 81 : // if( r2 < fEpsilon) 82 18906 : mask = _mm_movemask_ps(_mm_cmplt_ps(xmm_r2, xmm_small)) | 83 9453 : (_mm_movemask_ps(_mm_cmplt_ps(xmm_r2_4, xmm_small)) << 4); 84 9453 : if (mask) 85 399 : break; 86 : } 87 : #else 88 : #define LOOP_SIZE 4 89 : size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE; 90 : for (i = 0; i < nPointsRound; i += LOOP_SIZE) 91 : { 92 : __m128 xmm_rx = _mm_sub_ps(_mm_load_ps(pafX + i), 93 : xmm_x); /* rx = pafX[i] - fXPoint */ 94 : __m128 xmm_ry = _mm_sub_ps(_mm_load_ps(pafY + i), 95 : xmm_y); /* ry = pafY[i] - fYPoint */ 96 : __m128 xmm_r2 = 97 : _mm_add_ps(_mm_mul_ps(xmm_rx, xmm_rx), /* r2 = rx * rx + ry * ry */ 98 : _mm_mul_ps(xmm_ry, xmm_ry)); 99 : __m128 xmm_invr2 = _mm_rcp_ps(xmm_r2); /* invr2 = 1.0f / r2 */ 100 : xmm_nominator = 101 : _mm_add_ps(xmm_nominator, /* nominator += invr2 * pafZ[i] */ 102 : _mm_mul_ps(xmm_invr2, _mm_load_ps(pafZ + i))); 103 : xmm_denominator = 104 : _mm_add_ps(xmm_denominator, xmm_invr2); /* denominator += invr2 */ 105 : mask = _mm_movemask_ps( 106 : _mm_cmplt_ps(xmm_r2, xmm_small)); /* if( r2 < fEpsilon) */ 107 : if (mask) 108 : break; 109 : } 110 : #endif 111 : 112 : // Find which i triggered r2 < fEpsilon. 113 494 : if (mask) 114 : { 115 1790 : for (size_t j = 0; j < LOOP_SIZE; j++) 116 : { 117 1789 : if (mask & (1 << j)) 118 : { 119 398 : (*pdfValue) = (pafZ)[i + j]; 120 398 : return CE_None; 121 : } 122 : } 123 : } 124 : 125 : // Get back nominator and denominator values for XMM registers. 126 : float afNominator[4]; 127 : float afDenominator[4]; 128 : _mm_storeu_ps(afNominator, xmm_nominator); 129 : _mm_storeu_ps(afDenominator, xmm_denominator); 130 : 131 96 : float fNominator = 132 96 : afNominator[0] + afNominator[1] + afNominator[2] + afNominator[3]; 133 96 : float fDenominator = afDenominator[0] + afDenominator[1] + 134 96 : afDenominator[2] + afDenominator[3]; 135 : 136 : /* Do the few remaining loop iterations */ 137 195 : for (; i < nPoints; i++) 138 : { 139 100 : const float fRX = pafX[i] - fXPoint; 140 100 : const float fRY = pafY[i] - fYPoint; 141 100 : const float fR2 = fRX * fRX + fRY * fRY; 142 : 143 : // If the test point is close to the grid node, use the point 144 : // value directly as a node value to avoid singularity. 145 100 : if (fR2 < 0.0000000000001) 146 : { 147 1 : break; 148 : } 149 : else 150 : { 151 99 : const float fInvR2 = 1.0f / fR2; 152 99 : fNominator += fInvR2 * pafZ[i]; 153 99 : fDenominator += fInvR2; 154 : } 155 : } 156 : 157 96 : if (i != nPoints) 158 : { 159 1 : (*pdfValue) = pafZ[i]; 160 : } 161 95 : else if (fDenominator == 0.0) 162 : { 163 0 : (*pdfValue) = 164 : static_cast<const GDALGridInverseDistanceToAPowerOptions *>( 165 : poOptions) 166 0 : ->dfNoDataValue; 167 : } 168 : else 169 : { 170 95 : (*pdfValue) = fNominator / fDenominator; 171 : } 172 : 173 96 : return CE_None; 174 : } 175 : 176 : #endif /* HAVE_SSE_AT_COMPILE_TIME */