Line data Source code
1 : /****************************************************************************** 2 : * 3 : * Project: GDAL Gridding API. 4 : * Purpose: Implementation of GDAL scattered data gridder. 5 : * Author: Even Rouault, <even dot rouault at spatialys.com> 6 : * 7 : ****************************************************************************** 8 : * Copyright (c) 2013, Even Rouault <even dot rouault at spatialys.com> 9 : * 10 : * SPDX-License-Identifier: MIT 11 : ****************************************************************************/ 12 : 13 : #include "gdalgrid.h" 14 : #include "gdalgrid_priv.h" 15 : 16 : #ifdef HAVE_SSE_AT_COMPILE_TIME 17 : 18 : #ifdef USE_NEON_OPTIMIZATIONS 19 : #include "include_sse2neon.h" 20 : #else 21 : #include <xmmintrin.h> 22 : #endif 23 : 24 : /************************************************************************/ 25 : /* GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE() */ 26 : /************************************************************************/ 27 : 28 501 : CPLErr GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE( 29 : const void *poOptions, GUInt32 nPoints, 30 : CPL_UNUSED const double *unused_padfX, 31 : CPL_UNUSED const double *unused_padfY, 32 : CPL_UNUSED const double *unused_padfZ, double dfXPoint, double dfYPoint, 33 : double *pdfValue, void *hExtraParamsIn) 34 : { 35 501 : size_t i = 0; 36 501 : GDALGridExtraParameters *psExtraParams = 37 : static_cast<GDALGridExtraParameters *>(hExtraParamsIn); 38 501 : const float *pafX = psExtraParams->pafX; 39 501 : const float *pafY = psExtraParams->pafY; 40 501 : const float *pafZ = psExtraParams->pafZ; 41 : 42 501 : const float fEpsilon = 0.0000000000001f; 43 501 : const float fXPoint = static_cast<float>(dfXPoint); 44 501 : const float fYPoint = static_cast<float>(dfYPoint); 45 501 : const __m128 xmm_small = _mm_load1_ps(const_cast<float *>(&fEpsilon)); 46 501 : const __m128 xmm_x = _mm_load1_ps(const_cast<float *>(&fXPoint)); 47 501 : const __m128 xmm_y = _mm_load1_ps(const_cast<float *>(&fYPoint)); 48 501 : __m128 xmm_nominator = _mm_setzero_ps(); 49 501 : __m128 xmm_denominator = _mm_setzero_ps(); 50 501 : int mask = 0; 51 : 52 : #if defined(__x86_64) || defined(_M_X64) || defined(USE_NEON_OPTIMIZATIONS) 53 : // This would also work in 32bit mode, but there are only 8 XMM registers 54 : // whereas we have 16 for 64bit. 55 501 : const size_t LOOP_SIZE = 8; 56 501 : size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE; 57 10301 : for (i = 0; i < nPointsRound; i += LOOP_SIZE) 58 : { 59 : // rx = pafX[i] - fXPoint 60 20400 : __m128 xmm_rx = _mm_sub_ps(_mm_load_ps(pafX + i), xmm_x); 61 20400 : __m128 xmm_rx_4 = _mm_sub_ps(_mm_load_ps(pafX + i + 4), xmm_x); 62 : // ry = pafY[i] - fYPoint 63 20400 : __m128 xmm_ry = _mm_sub_ps(_mm_load_ps(pafY + i), xmm_y); 64 30600 : __m128 xmm_ry_4 = _mm_sub_ps(_mm_load_ps(pafY + i + 4), xmm_y); 65 : // r2 = rx * rx + ry * ry 66 : __m128 xmm_r2 = 67 30600 : _mm_add_ps(_mm_mul_ps(xmm_rx, xmm_rx), _mm_mul_ps(xmm_ry, xmm_ry)); 68 30600 : __m128 xmm_r2_4 = _mm_add_ps(_mm_mul_ps(xmm_rx_4, xmm_rx_4), 69 : _mm_mul_ps(xmm_ry_4, xmm_ry_4)); 70 : // invr2 = 1.0f / r2 71 10200 : __m128 xmm_invr2 = _mm_rcp_ps(xmm_r2); 72 10200 : __m128 xmm_invr2_4 = _mm_rcp_ps(xmm_r2_4); 73 : // nominator += invr2 * pafZ[i] 74 20400 : xmm_nominator = _mm_add_ps( 75 10200 : xmm_nominator, _mm_mul_ps(xmm_invr2, _mm_load_ps(pafZ + i))); 76 30600 : xmm_nominator = _mm_add_ps( 77 10200 : xmm_nominator, _mm_mul_ps(xmm_invr2_4, _mm_load_ps(pafZ + i + 4))); 78 : // denominator += invr2 79 10200 : xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2); 80 10200 : xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2_4); 81 : // if( r2 < fEpsilon) 82 20400 : mask = _mm_movemask_ps(_mm_cmplt_ps(xmm_r2, xmm_small)) | 83 10200 : (_mm_movemask_ps(_mm_cmplt_ps(xmm_r2_4, xmm_small)) << 4); 84 10200 : if (mask) 85 400 : break; 86 : } 87 : #else 88 : #define LOOP_SIZE 4 89 : size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE; 90 : for (i = 0; i < nPointsRound; i += LOOP_SIZE) 91 : { 92 : __m128 xmm_rx = _mm_sub_ps(_mm_load_ps(pafX + i), 93 : xmm_x); /* rx = pafX[i] - fXPoint */ 94 : __m128 xmm_ry = _mm_sub_ps(_mm_load_ps(pafY + i), 95 : xmm_y); /* ry = pafY[i] - fYPoint */ 96 : __m128 xmm_r2 = 97 : _mm_add_ps(_mm_mul_ps(xmm_rx, xmm_rx), /* r2 = rx * rx + ry * ry */ 98 : _mm_mul_ps(xmm_ry, xmm_ry)); 99 : __m128 xmm_invr2 = _mm_rcp_ps(xmm_r2); /* invr2 = 1.0f / r2 */ 100 : xmm_nominator = 101 : _mm_add_ps(xmm_nominator, /* nominator += invr2 * pafZ[i] */ 102 : _mm_mul_ps(xmm_invr2, _mm_load_ps(pafZ + i))); 103 : xmm_denominator = 104 : _mm_add_ps(xmm_denominator, xmm_invr2); /* denominator += invr2 */ 105 : mask = _mm_movemask_ps( 106 : _mm_cmplt_ps(xmm_r2, xmm_small)); /* if( r2 < fEpsilon) */ 107 : if (mask) 108 : break; 109 : } 110 : #endif 111 : 112 : // Find which i triggered r2 < fEpsilon. 113 501 : if (mask) 114 : { 115 1800 : for (size_t j = 0; j < LOOP_SIZE; j++) 116 : { 117 1800 : if (mask & (1 << j)) 118 : { 119 400 : (*pdfValue) = (pafZ)[i + j]; 120 400 : return CE_None; 121 : } 122 : } 123 : } 124 : 125 : // Get back nominator and denominator values for XMM registers. 126 : float afNominator[4]; 127 : float afDenominator[4]; 128 : _mm_storeu_ps(afNominator, xmm_nominator); 129 : _mm_storeu_ps(afDenominator, xmm_denominator); 130 : 131 101 : float fNominator = 132 101 : afNominator[0] + afNominator[1] + afNominator[2] + afNominator[3]; 133 101 : float fDenominator = afDenominator[0] + afDenominator[1] + 134 101 : afDenominator[2] + afDenominator[3]; 135 : 136 : /* Do the few remaining loop iterations */ 137 201 : for (; i < nPoints; i++) 138 : { 139 101 : const float fRX = pafX[i] - fXPoint; 140 101 : const float fRY = pafY[i] - fYPoint; 141 101 : const float fR2 = fRX * fRX + fRY * fRY; 142 : 143 : // If the test point is close to the grid node, use the point 144 : // value directly as a node value to avoid singularity. 145 101 : if (fR2 < 0.0000000000001) 146 : { 147 1 : break; 148 : } 149 : else 150 : { 151 100 : const float fInvR2 = 1.0f / fR2; 152 100 : fNominator += fInvR2 * pafZ[i]; 153 100 : fDenominator += fInvR2; 154 : } 155 : } 156 : 157 101 : if (i != nPoints) 158 : { 159 1 : (*pdfValue) = pafZ[i]; 160 : } 161 100 : else if (fDenominator == 0.0) 162 : { 163 0 : (*pdfValue) = 164 : static_cast<const GDALGridInverseDistanceToAPowerOptions *>( 165 : poOptions) 166 0 : ->dfNoDataValue; 167 : } 168 : else 169 : { 170 100 : (*pdfValue) = fNominator / fDenominator; 171 : } 172 : 173 101 : return CE_None; 174 : } 175 : 176 : #endif /* HAVE_SSE_AT_COMPILE_TIME */