Line data Source code
1 : /****************************************************************************** 2 : * 3 : * Project: GDAL Gridding API. 4 : * Purpose: Implementation of GDAL scattered data gridder. 5 : * Author: Even Rouault, <even dot rouault at spatialys.com> 6 : * 7 : ****************************************************************************** 8 : * Copyright (c) 2013, Even Rouault <even dot rouault at spatialys.com> 9 : * 10 : * SPDX-License-Identifier: MIT 11 : ****************************************************************************/ 12 : 13 : #include "gdalgrid.h" 14 : #include "gdalgrid_priv.h" 15 : 16 : #ifdef HAVE_AVX_AT_COMPILE_TIME 17 : #include <immintrin.h> 18 : 19 : /************************************************************************/ 20 : /* GDALGridInverseDistanceToAPower2NoSmoothingNoSearchAVX() */ 21 : /************************************************************************/ 22 : 23 1221500 : CPLErr GDALGridInverseDistanceToAPower2NoSmoothingNoSearchAVX( 24 : const void *poOptions, GUInt32 nPoints, 25 : CPL_UNUSED const double *unused_padfX, 26 : CPL_UNUSED const double *unused_padfY, 27 : CPL_UNUSED const double *unused_padfZ, double dfXPoint, double dfYPoint, 28 : double *pdfValue, void *hExtraParamsIn) 29 : { 30 1221500 : size_t i = 0; 31 1221500 : GDALGridExtraParameters *psExtraParams = 32 : static_cast<GDALGridExtraParameters *>(hExtraParamsIn); 33 1221500 : const float *pafX = psExtraParams->pafX; 34 1221500 : const float *pafY = psExtraParams->pafY; 35 1221500 : const float *pafZ = psExtraParams->pafZ; 36 : 37 1221500 : const float fEpsilon = 0.0000000000001f; 38 1221500 : const float fXPoint = static_cast<float>(dfXPoint); 39 1221500 : const float fYPoint = static_cast<float>(dfYPoint); 40 1221500 : const __m256 ymm_small = _mm256_set1_ps(fEpsilon); 41 1221500 : const __m256 ymm_x = _mm256_set1_ps(fXPoint); 42 1221500 : const __m256 ymm_y = _mm256_set1_ps(fYPoint); 43 1221500 : __m256 ymm_nominator = _mm256_setzero_ps(); 44 1221500 : __m256 ymm_denominator = _mm256_setzero_ps(); 45 1221500 : int mask = 0; 46 : 47 : #undef LOOP_SIZE 48 : #if defined(__x86_64) || defined(_M_X64) 49 : /* This would also work in 32bit mode, but there are only 8 XMM registers */ 50 : /* whereas we have 16 for 64bit */ 51 : #define LOOP_SIZE 16 52 1221500 : size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE; 53 1235570 : for (i = 0; i < nPointsRound; i += LOOP_SIZE) 54 : { 55 30568 : __m256 ymm_rx = _mm256_sub_ps(_mm256_load_ps(pafX + i), 56 : ymm_x); /* rx = pafX[i] - fXPoint */ 57 30568 : __m256 ymm_rx_8 = _mm256_sub_ps(_mm256_load_ps(pafX + i + 8), ymm_x); 58 30568 : __m256 ymm_ry = _mm256_sub_ps(_mm256_load_ps(pafY + i), 59 : ymm_y); /* ry = pafY[i] - fYPoint */ 60 45852 : __m256 ymm_ry_8 = _mm256_sub_ps(_mm256_load_ps(pafY + i + 8), ymm_y); 61 45852 : __m256 ymm_r2 = _mm256_add_ps( 62 : _mm256_mul_ps(ymm_rx, ymm_rx), /* r2 = rx * rx + ry * ry */ 63 : _mm256_mul_ps(ymm_ry, ymm_ry)); 64 45852 : __m256 ymm_r2_8 = _mm256_add_ps(_mm256_mul_ps(ymm_rx_8, ymm_rx_8), 65 : _mm256_mul_ps(ymm_ry_8, ymm_ry_8)); 66 15284 : __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2); /* invr2 = 1.0f / r2 */ 67 15284 : __m256 ymm_invr2_8 = _mm256_rcp_ps(ymm_r2_8); 68 : ymm_nominator = 69 30568 : _mm256_add_ps(ymm_nominator, /* nominator += invr2 * pafZ[i] */ 70 15284 : _mm256_mul_ps(ymm_invr2, _mm256_load_ps(pafZ + i))); 71 45852 : ymm_nominator = _mm256_add_ps( 72 : ymm_nominator, 73 15284 : _mm256_mul_ps(ymm_invr2_8, _mm256_load_ps(pafZ + i + 8))); 74 15284 : ymm_denominator = _mm256_add_ps(ymm_denominator, 75 : ymm_invr2); /* denominator += invr2 */ 76 15284 : ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2_8); 77 15271 : mask = 78 15284 : _mm256_movemask_ps(_mm256_cmp_ps( 79 : ymm_r2, ymm_small, _CMP_LT_OS)) | /* if( r2 < fEpsilon) */ 80 15282 : (_mm256_movemask_ps(_mm256_cmp_ps(ymm_r2_8, ymm_small, _CMP_LT_OS)) 81 15271 : << 8); 82 15271 : if (mask) 83 1199 : break; 84 : } 85 : #else 86 : #define LOOP_SIZE 8 87 : size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE; 88 : for (i = 0; i < nPointsRound; i += LOOP_SIZE) 89 : { 90 : __m256 ymm_rx = 91 : _mm256_sub_ps(_mm256_load_ps(const_cast<float *>(pafX) + i), 92 : ymm_x); /* rx = pafX[i] - fXPoint */ 93 : __m256 ymm_ry = 94 : _mm256_sub_ps(_mm256_load_ps(const_cast<float *>(pafY) + i), 95 : ymm_y); /* ry = pafY[i] - fYPoint */ 96 : __m256 ymm_r2 = _mm256_add_ps( 97 : _mm256_mul_ps(ymm_rx, ymm_rx), /* r2 = rx * rx + ry * ry */ 98 : _mm256_mul_ps(ymm_ry, ymm_ry)); 99 : __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2); /* invr2 = 1.0f / r2 */ 100 : ymm_nominator = _mm256_add_ps( 101 : ymm_nominator, /* nominator += invr2 * pafZ[i] */ 102 : _mm256_mul_ps(ymm_invr2, 103 : _mm256_load_ps(const_cast<float *>(pafZ) + i))); 104 : ymm_denominator = _mm256_add_ps(ymm_denominator, 105 : ymm_invr2); /* denominator += invr2 */ 106 : mask = _mm256_movemask_ps(_mm256_cmp_ps( 107 : ymm_r2, ymm_small, _CMP_LT_OS)); /* if( r2 < fEpsilon) */ 108 : if (mask) 109 : break; 110 : } 111 : #endif 112 : 113 : // Find which i triggered r2 < fEpsilon. 114 1221490 : if (mask) 115 : { 116 10151 : for (int j = 0; j < LOOP_SIZE; j++) 117 : { 118 10148 : if (mask & (1 << j)) 119 : { 120 1196 : (*pdfValue) = (pafZ)[i + j]; 121 : 122 1196 : return CE_None; 123 : } 124 : } 125 : } 126 : #undef LOOP_SIZE 127 : 128 : // Get back nominator and denominator values for YMM registers. 129 : float afNominator[8]; 130 : float afDenominator[8]; 131 : _mm256_storeu_ps(afNominator, ymm_nominator); 132 : _mm256_storeu_ps(afDenominator, ymm_denominator); 133 : 134 : // MSVC doesn't emit AVX afterwards but may use SSE, so clear 135 : // upper bits. Other compilers will continue using AVX for the 136 : // below floating points operations. 137 : #if defined(_MSC_FULL_VER) 138 : _mm256_zeroupper(); 139 : #endif 140 : 141 1220290 : float fNominator = afNominator[0] + afNominator[1] + afNominator[2] + 142 1220290 : afNominator[3] + afNominator[4] + afNominator[5] + 143 1220290 : afNominator[6] + afNominator[7]; 144 1220290 : float fDenominator = afDenominator[0] + afDenominator[1] + 145 1220290 : afDenominator[2] + afDenominator[3] + 146 1220290 : afDenominator[4] + afDenominator[5] + 147 1220290 : afDenominator[6] + afDenominator[7]; 148 : 149 : // Do the few remaining loop iterations. 150 6703850 : for (; i < nPoints; i++) 151 : { 152 5483580 : const float fRX = pafX[i] - fXPoint; 153 5483580 : const float fRY = pafY[i] - fYPoint; 154 5483580 : const float fR2 = fRX * fRX + fRY * fRY; 155 : 156 : // If the test point is close to the grid node, use the point 157 : // value directly as a node value to avoid singularity. 158 5483580 : if (fR2 < 0.0000000000001) 159 : { 160 15 : break; 161 : } 162 : else 163 : { 164 5483560 : const float fInvR2 = 1.0f / fR2; 165 5483560 : fNominator += fInvR2 * pafZ[i]; 166 5483560 : fDenominator += fInvR2; 167 : } 168 : } 169 : 170 1220290 : if (i != nPoints) 171 : { 172 15 : (*pdfValue) = pafZ[i]; 173 : } 174 1220280 : else if (fDenominator == 0.0) 175 : { 176 0 : (*pdfValue) = 177 : static_cast<const GDALGridInverseDistanceToAPowerOptions *>( 178 : poOptions) 179 0 : ->dfNoDataValue; 180 : } 181 : else 182 1220280 : (*pdfValue) = fNominator / fDenominator; 183 : 184 1220290 : return CE_None; 185 : } 186 : 187 : #endif /* HAVE_AVX_AT_COMPILE_TIME */