Line data Source code
1 : /******************************************************************************
2 : *
3 : * Project: GDAL Gridding API.
4 : * Purpose: Implementation of GDAL scattered data gridder.
5 : * Author: Even Rouault, <even dot rouault at spatialys.com>
6 : *
7 : ******************************************************************************
8 : * Copyright (c) 2013, Even Rouault <even dot rouault at spatialys.com>
9 : *
10 : * SPDX-License-Identifier: MIT
11 : ****************************************************************************/
12 :
13 : #include "gdalgrid.h"
14 : #include "gdalgrid_priv.h"
15 :
16 : #ifdef HAVE_AVX_AT_COMPILE_TIME
17 : #include <immintrin.h>
18 :
19 : /************************************************************************/
20 : /* GDALGridInverseDistanceToAPower2NoSmoothingNoSearchAVX() */
21 : /************************************************************************/
22 :
23 : #define GDAL_mm256_load1_ps(x) _mm256_set_ps(x, x, x, x, x, x, x, x)
24 :
25 1323 : CPLErr GDALGridInverseDistanceToAPower2NoSmoothingNoSearchAVX(
26 : const void *poOptions, GUInt32 nPoints,
27 : CPL_UNUSED const double *unused_padfX,
28 : CPL_UNUSED const double *unused_padfY,
29 : CPL_UNUSED const double *unused_padfZ, double dfXPoint, double dfYPoint,
30 : double *pdfValue, void *hExtraParamsIn)
31 : {
32 1323 : size_t i = 0;
33 1323 : GDALGridExtraParameters *psExtraParams =
34 : static_cast<GDALGridExtraParameters *>(hExtraParamsIn);
35 1323 : const float *pafX = psExtraParams->pafX;
36 1323 : const float *pafY = psExtraParams->pafY;
37 1323 : const float *pafZ = psExtraParams->pafZ;
38 :
39 1323 : const float fEpsilon = 0.0000000000001f;
40 1323 : const float fXPoint = static_cast<float>(dfXPoint);
41 1323 : const float fYPoint = static_cast<float>(dfYPoint);
42 1323 : const __m256 ymm_small = GDAL_mm256_load1_ps(fEpsilon);
43 1323 : const __m256 ymm_x = GDAL_mm256_load1_ps(fXPoint);
44 1323 : const __m256 ymm_y = GDAL_mm256_load1_ps(fYPoint);
45 1323 : __m256 ymm_nominator = _mm256_setzero_ps();
46 1323 : __m256 ymm_denominator = _mm256_setzero_ps();
47 1323 : int mask = 0;
48 :
49 : #undef LOOP_SIZE
50 : #if defined(__x86_64) || defined(_M_X64)
51 : /* This would also work in 32bit mode, but there are only 8 XMM registers */
52 : /* whereas we have 16 for 64bit */
53 : #define LOOP_SIZE 16
54 1323 : size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
55 14554 : for (i = 0; i < nPointsRound; i += LOOP_SIZE)
56 : {
57 29166 : __m256 ymm_rx = _mm256_sub_ps(_mm256_load_ps(pafX + i),
58 : ymm_x); /* rx = pafX[i] - fXPoint */
59 29166 : __m256 ymm_rx_8 = _mm256_sub_ps(_mm256_load_ps(pafX + i + 8), ymm_x);
60 29166 : __m256 ymm_ry = _mm256_sub_ps(_mm256_load_ps(pafY + i),
61 : ymm_y); /* ry = pafY[i] - fYPoint */
62 43749 : __m256 ymm_ry_8 = _mm256_sub_ps(_mm256_load_ps(pafY + i + 8), ymm_y);
63 43749 : __m256 ymm_r2 = _mm256_add_ps(
64 : _mm256_mul_ps(ymm_rx, ymm_rx), /* r2 = rx * rx + ry * ry */
65 : _mm256_mul_ps(ymm_ry, ymm_ry));
66 43749 : __m256 ymm_r2_8 = _mm256_add_ps(_mm256_mul_ps(ymm_rx_8, ymm_rx_8),
67 : _mm256_mul_ps(ymm_ry_8, ymm_ry_8));
68 14583 : __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2); /* invr2 = 1.0f / r2 */
69 14583 : __m256 ymm_invr2_8 = _mm256_rcp_ps(ymm_r2_8);
70 : ymm_nominator =
71 29166 : _mm256_add_ps(ymm_nominator, /* nominator += invr2 * pafZ[i] */
72 14583 : _mm256_mul_ps(ymm_invr2, _mm256_load_ps(pafZ + i)));
73 43749 : ymm_nominator = _mm256_add_ps(
74 : ymm_nominator,
75 14583 : _mm256_mul_ps(ymm_invr2_8, _mm256_load_ps(pafZ + i + 8)));
76 14583 : ymm_denominator = _mm256_add_ps(ymm_denominator,
77 : ymm_invr2); /* denominator += invr2 */
78 14583 : ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2_8);
79 14431 : mask =
80 14583 : _mm256_movemask_ps(_mm256_cmp_ps(
81 : ymm_r2, ymm_small, _CMP_LT_OS)) | /* if( r2 < fEpsilon) */
82 14400 : (_mm256_movemask_ps(_mm256_cmp_ps(ymm_r2_8, ymm_small, _CMP_LT_OS))
83 14431 : << 8);
84 14431 : if (mask)
85 1200 : break;
86 : }
87 : #else
88 : #define LOOP_SIZE 8
89 : size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
90 : for (i = 0; i < nPointsRound; i += LOOP_SIZE)
91 : {
92 : __m256 ymm_rx =
93 : _mm256_sub_ps(_mm256_load_ps(const_cast<float *>(pafX) + i),
94 : ymm_x); /* rx = pafX[i] - fXPoint */
95 : __m256 ymm_ry =
96 : _mm256_sub_ps(_mm256_load_ps(const_cast<float *>(pafY) + i),
97 : ymm_y); /* ry = pafY[i] - fYPoint */
98 : __m256 ymm_r2 = _mm256_add_ps(
99 : _mm256_mul_ps(ymm_rx, ymm_rx), /* r2 = rx * rx + ry * ry */
100 : _mm256_mul_ps(ymm_ry, ymm_ry));
101 : __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2); /* invr2 = 1.0f / r2 */
102 : ymm_nominator = _mm256_add_ps(
103 : ymm_nominator, /* nominator += invr2 * pafZ[i] */
104 : _mm256_mul_ps(ymm_invr2,
105 : _mm256_load_ps(const_cast<float *>(pafZ) + i)));
106 : ymm_denominator = _mm256_add_ps(ymm_denominator,
107 : ymm_invr2); /* denominator += invr2 */
108 : mask = _mm256_movemask_ps(_mm256_cmp_ps(
109 : ymm_r2, ymm_small, _CMP_LT_OS)); /* if( r2 < fEpsilon) */
110 : if (mask)
111 : break;
112 : }
113 : #endif
114 :
115 : // Find which i triggered r2 < fEpsilon.
116 1171 : if (mask)
117 : {
118 10133 : for (int j = 0; j < LOOP_SIZE; j++)
119 : {
120 10132 : if (mask & (1 << j))
121 : {
122 1199 : (*pdfValue) = (pafZ)[i + j];
123 :
124 : // GCC and MSVC need explicit zeroing.
125 : #if !defined(__clang__)
126 : _mm256_zeroupper();
127 : #endif
128 1200 : return CE_None;
129 : }
130 : }
131 : }
132 : #undef LOOP_SIZE
133 :
134 : // Get back nominator and denominator values for YMM registers.
135 : float afNominator[8];
136 : float afDenominator[8];
137 : _mm256_storeu_ps(afNominator, ymm_nominator);
138 : _mm256_storeu_ps(afDenominator, ymm_denominator);
139 :
140 : // MSVC doesn't emit AVX afterwards but may use SSE, so clear
141 : // upper bits. Other compilers will continue using AVX for the
142 : // below floating points operations.
143 : #if defined(_MSC_FULL_VER)
144 : _mm256_zeroupper();
145 : #endif
146 :
147 0 : float fNominator = afNominator[0] + afNominator[1] + afNominator[2] +
148 0 : afNominator[3] + afNominator[4] + afNominator[5] +
149 0 : afNominator[6] + afNominator[7];
150 0 : float fDenominator = afDenominator[0] + afDenominator[1] +
151 0 : afDenominator[2] + afDenominator[3] +
152 0 : afDenominator[4] + afDenominator[5] +
153 0 : afDenominator[6] + afDenominator[7];
154 :
155 : // Do the few remaining loop iterations.
156 139 : for (; i < nPoints; i++)
157 : {
158 182 : const float fRX = pafX[i] - fXPoint;
159 182 : const float fRY = pafY[i] - fYPoint;
160 182 : const float fR2 = fRX * fRX + fRY * fRY;
161 :
162 : // If the test point is close to the grid node, use the point
163 : // value directly as a node value to avoid singularity.
164 182 : if (fR2 < 0.0000000000001)
165 : {
166 15 : break;
167 : }
168 : else
169 : {
170 167 : const float fInvR2 = 1.0f / fR2;
171 167 : fNominator += fInvR2 * pafZ[i];
172 167 : fDenominator += fInvR2;
173 : }
174 : }
175 :
176 0 : if (i != nPoints)
177 : {
178 15 : (*pdfValue) = pafZ[i];
179 : }
180 0 : else if (fDenominator == 0.0)
181 : {
182 0 : (*pdfValue) =
183 : static_cast<const GDALGridInverseDistanceToAPowerOptions *>(
184 : poOptions)
185 0 : ->dfNoDataValue;
186 : }
187 : else
188 0 : (*pdfValue) = fNominator / fDenominator;
189 :
190 : // GCC needs explicit zeroing.
191 : #if defined(__GNUC__) && !defined(__clang__)
192 : _mm256_zeroupper();
193 : #endif
194 :
195 125 : return CE_None;
196 : }
197 :
198 : #endif /* HAVE_AVX_AT_COMPILE_TIME */
|