Line data Source code
1 : /****************************************************************************** 2 : * 3 : * Project: GDAL Gridding API. 4 : * Purpose: Implementation of GDAL scattered data gridder. 5 : * Author: Even Rouault, <even dot rouault at spatialys.com> 6 : * 7 : ****************************************************************************** 8 : * Copyright (c) 2013, Even Rouault <even dot rouault at spatialys.com> 9 : * 10 : * SPDX-License-Identifier: MIT 11 : ****************************************************************************/ 12 : 13 : #include "gdalgrid.h" 14 : #include "gdalgrid_priv.h" 15 : 16 : #ifdef HAVE_SSE_AT_COMPILE_TIME 17 : #include <xmmintrin.h> 18 : 19 : /************************************************************************/ 20 : /* GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE() */ 21 : /************************************************************************/ 22 : 23 499 : CPLErr GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE( 24 : const void *poOptions, GUInt32 nPoints, 25 : CPL_UNUSED const double *unused_padfX, 26 : CPL_UNUSED const double *unused_padfY, 27 : CPL_UNUSED const double *unused_padfZ, double dfXPoint, double dfYPoint, 28 : double *pdfValue, void *hExtraParamsIn) 29 : { 30 499 : size_t i = 0; 31 499 : GDALGridExtraParameters *psExtraParams = 32 : static_cast<GDALGridExtraParameters *>(hExtraParamsIn); 33 499 : const float *pafX = psExtraParams->pafX; 34 499 : const float *pafY = psExtraParams->pafY; 35 499 : const float *pafZ = psExtraParams->pafZ; 36 : 37 499 : const float fEpsilon = 0.0000000000001f; 38 499 : const float fXPoint = static_cast<float>(dfXPoint); 39 499 : const float fYPoint = static_cast<float>(dfYPoint); 40 499 : const __m128 xmm_small = _mm_load1_ps(const_cast<float *>(&fEpsilon)); 41 499 : const __m128 xmm_x = _mm_load1_ps(const_cast<float *>(&fXPoint)); 42 499 : const __m128 xmm_y = _mm_load1_ps(const_cast<float *>(&fYPoint)); 43 499 : __m128 xmm_nominator = _mm_setzero_ps(); 44 499 : __m128 xmm_denominator = _mm_setzero_ps(); 45 499 : int mask = 0; 46 : 47 : #if defined(__x86_64) || defined(_M_X64) 48 : // This would also work in 32bit mode, but there are only 8 XMM registers 49 : // whereas we have 16 for 64bit. 50 499 : const size_t LOOP_SIZE = 8; 51 499 : size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE; 52 9035 : for (i = 0; i < nPointsRound; i += LOOP_SIZE) 53 : { 54 : // rx = pafX[i] - fXPoint 55 17872 : __m128 xmm_rx = _mm_sub_ps(_mm_load_ps(pafX + i), xmm_x); 56 17872 : __m128 xmm_rx_4 = _mm_sub_ps(_mm_load_ps(pafX + i + 4), xmm_x); 57 : // ry = pafY[i] - fYPoint 58 17872 : __m128 xmm_ry = _mm_sub_ps(_mm_load_ps(pafY + i), xmm_y); 59 26808 : __m128 xmm_ry_4 = _mm_sub_ps(_mm_load_ps(pafY + i + 4), xmm_y); 60 : // r2 = rx * rx + ry * ry 61 : __m128 xmm_r2 = 62 26808 : _mm_add_ps(_mm_mul_ps(xmm_rx, xmm_rx), _mm_mul_ps(xmm_ry, xmm_ry)); 63 26808 : __m128 xmm_r2_4 = _mm_add_ps(_mm_mul_ps(xmm_rx_4, xmm_rx_4), 64 : _mm_mul_ps(xmm_ry_4, xmm_ry_4)); 65 : // invr2 = 1.0f / r2 66 8936 : __m128 xmm_invr2 = _mm_rcp_ps(xmm_r2); 67 8936 : __m128 xmm_invr2_4 = _mm_rcp_ps(xmm_r2_4); 68 : // nominator += invr2 * pafZ[i] 69 17872 : xmm_nominator = _mm_add_ps( 70 8936 : xmm_nominator, _mm_mul_ps(xmm_invr2, _mm_load_ps(pafZ + i))); 71 26808 : xmm_nominator = _mm_add_ps( 72 8936 : xmm_nominator, _mm_mul_ps(xmm_invr2_4, _mm_load_ps(pafZ + i + 4))); 73 : // denominator += invr2 74 8936 : xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2); 75 8936 : xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2_4); 76 : // if( r2 < fEpsilon) 77 17872 : mask = _mm_movemask_ps(_mm_cmplt_ps(xmm_r2, xmm_small)) | 78 8936 : (_mm_movemask_ps(_mm_cmplt_ps(xmm_r2_4, xmm_small)) << 4); 79 8936 : if (mask) 80 400 : break; 81 : } 82 : #else 83 : #define LOOP_SIZE 4 84 : size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE; 85 : for (i = 0; i < nPointsRound; i += LOOP_SIZE) 86 : { 87 : __m128 xmm_rx = _mm_sub_ps(_mm_load_ps(pafX + i), 88 : xmm_x); /* rx = pafX[i] - fXPoint */ 89 : __m128 xmm_ry = _mm_sub_ps(_mm_load_ps(pafY + i), 90 : xmm_y); /* ry = pafY[i] - fYPoint */ 91 : __m128 xmm_r2 = 92 : _mm_add_ps(_mm_mul_ps(xmm_rx, xmm_rx), /* r2 = rx * rx + ry * ry */ 93 : _mm_mul_ps(xmm_ry, xmm_ry)); 94 : __m128 xmm_invr2 = _mm_rcp_ps(xmm_r2); /* invr2 = 1.0f / r2 */ 95 : xmm_nominator = 96 : _mm_add_ps(xmm_nominator, /* nominator += invr2 * pafZ[i] */ 97 : _mm_mul_ps(xmm_invr2, _mm_load_ps(pafZ + i))); 98 : xmm_denominator = 99 : _mm_add_ps(xmm_denominator, xmm_invr2); /* denominator += invr2 */ 100 : mask = _mm_movemask_ps( 101 : _mm_cmplt_ps(xmm_r2, xmm_small)); /* if( r2 < fEpsilon) */ 102 : if (mask) 103 : break; 104 : } 105 : #endif 106 : 107 : // Find which i triggered r2 < fEpsilon. 108 499 : if (mask) 109 : { 110 1796 : for (size_t j = 0; j < LOOP_SIZE; j++) 111 : { 112 1796 : if (mask & (1 << j)) 113 : { 114 400 : (*pdfValue) = (pafZ)[i + j]; 115 400 : return CE_None; 116 : } 117 : } 118 : } 119 : 120 : // Get back nominator and denominator values for XMM registers. 121 : float afNominator[4]; 122 : float afDenominator[4]; 123 : _mm_storeu_ps(afNominator, xmm_nominator); 124 : _mm_storeu_ps(afDenominator, xmm_denominator); 125 : 126 99 : float fNominator = 127 99 : afNominator[0] + afNominator[1] + afNominator[2] + afNominator[3]; 128 99 : float fDenominator = afDenominator[0] + afDenominator[1] + 129 99 : afDenominator[2] + afDenominator[3]; 130 : 131 : /* Do the few remaining loop iterations */ 132 199 : for (; i < nPoints; i++) 133 : { 134 101 : const float fRX = pafX[i] - fXPoint; 135 101 : const float fRY = pafY[i] - fYPoint; 136 101 : const float fR2 = fRX * fRX + fRY * fRY; 137 : 138 : // If the test point is close to the grid node, use the point 139 : // value directly as a node value to avoid singularity. 140 101 : if (fR2 < 0.0000000000001) 141 : { 142 1 : break; 143 : } 144 : else 145 : { 146 100 : const float fInvR2 = 1.0f / fR2; 147 100 : fNominator += fInvR2 * pafZ[i]; 148 100 : fDenominator += fInvR2; 149 : } 150 : } 151 : 152 99 : if (i != nPoints) 153 : { 154 1 : (*pdfValue) = pafZ[i]; 155 : } 156 98 : else if (fDenominator == 0.0) 157 : { 158 0 : (*pdfValue) = 159 : static_cast<const GDALGridInverseDistanceToAPowerOptions *>( 160 : poOptions) 161 0 : ->dfNoDataValue; 162 : } 163 : else 164 : { 165 98 : (*pdfValue) = fNominator / fDenominator; 166 : } 167 : 168 99 : return CE_None; 169 : } 170 : 171 : #endif /* HAVE_SSE_AT_COMPILE_TIME */