Line data Source code
1 : /******************************************************************************
2 : *
3 : * Project: GDAL Gridding API.
4 : * Purpose: Implementation of GDAL scattered data gridder.
5 : * Author: Even Rouault, <even dot rouault at spatialys.com>
6 : *
7 : ******************************************************************************
8 : * Copyright (c) 2013, Even Rouault <even dot rouault at spatialys.com>
9 : *
10 : * Permission is hereby granted, free of charge, to any person obtaining a
11 : * copy of this software and associated documentation files (the "Software"),
12 : * to deal in the Software without restriction, including without limitation
13 : * the rights to use, copy, modify, merge, publish, distribute, sublicense,
14 : * and/or sell copies of the Software, and to permit persons to whom the
15 : * Software is furnished to do so, subject to the following conditions:
16 : *
17 : * The above copyright notice and this permission notice shall be included
18 : * in all copies or substantial portions of the Software.
19 : *
20 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
21 : * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 : * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 : * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 : * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25 : * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 : * DEALINGS IN THE SOFTWARE.
27 : ****************************************************************************/
28 :
29 : #include "gdalgrid.h"
30 : #include "gdalgrid_priv.h"
31 :
32 : #ifdef HAVE_AVX_AT_COMPILE_TIME
33 : #include <immintrin.h>
34 :
35 : /************************************************************************/
36 : /* GDALGridInverseDistanceToAPower2NoSmoothingNoSearchAVX() */
37 : /************************************************************************/
38 :
39 : #define GDAL_mm256_load1_ps(x) _mm256_set_ps(x, x, x, x, x, x, x, x)
40 :
41 1320 : CPLErr GDALGridInverseDistanceToAPower2NoSmoothingNoSearchAVX(
42 : const void *poOptions, GUInt32 nPoints,
43 : CPL_UNUSED const double *unused_padfX,
44 : CPL_UNUSED const double *unused_padfY,
45 : CPL_UNUSED const double *unused_padfZ, double dfXPoint, double dfYPoint,
46 : double *pdfValue, void *hExtraParamsIn)
47 : {
48 1320 : size_t i = 0;
49 1320 : GDALGridExtraParameters *psExtraParams =
50 : static_cast<GDALGridExtraParameters *>(hExtraParamsIn);
51 1320 : const float *pafX = psExtraParams->pafX;
52 1320 : const float *pafY = psExtraParams->pafY;
53 1320 : const float *pafZ = psExtraParams->pafZ;
54 :
55 1320 : const float fEpsilon = 0.0000000000001f;
56 1320 : const float fXPoint = static_cast<float>(dfXPoint);
57 1320 : const float fYPoint = static_cast<float>(dfYPoint);
58 1320 : const __m256 ymm_small = GDAL_mm256_load1_ps(fEpsilon);
59 1320 : const __m256 ymm_x = GDAL_mm256_load1_ps(fXPoint);
60 1320 : const __m256 ymm_y = GDAL_mm256_load1_ps(fYPoint);
61 1320 : __m256 ymm_nominator = _mm256_setzero_ps();
62 1320 : __m256 ymm_denominator = _mm256_setzero_ps();
63 1320 : int mask = 0;
64 :
65 : #undef LOOP_SIZE
66 : #if defined(__x86_64) || defined(_M_X64)
67 : /* This would also work in 32bit mode, but there are only 8 XMM registers */
68 : /* whereas we have 16 for 64bit */
69 : #define LOOP_SIZE 16
70 1320 : size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
71 15715 : for (i = 0; i < nPointsRound; i += LOOP_SIZE)
72 : {
73 31102 : __m256 ymm_rx = _mm256_sub_ps(_mm256_load_ps(pafX + i),
74 : ymm_x); /* rx = pafX[i] - fXPoint */
75 31102 : __m256 ymm_rx_8 = _mm256_sub_ps(_mm256_load_ps(pafX + i + 8), ymm_x);
76 31102 : __m256 ymm_ry = _mm256_sub_ps(_mm256_load_ps(pafY + i),
77 : ymm_y); /* ry = pafY[i] - fYPoint */
78 46653 : __m256 ymm_ry_8 = _mm256_sub_ps(_mm256_load_ps(pafY + i + 8), ymm_y);
79 46653 : __m256 ymm_r2 = _mm256_add_ps(
80 : _mm256_mul_ps(ymm_rx, ymm_rx), /* r2 = rx * rx + ry * ry */
81 : _mm256_mul_ps(ymm_ry, ymm_ry));
82 46653 : __m256 ymm_r2_8 = _mm256_add_ps(_mm256_mul_ps(ymm_rx_8, ymm_rx_8),
83 : _mm256_mul_ps(ymm_ry_8, ymm_ry_8));
84 15551 : __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2); /* invr2 = 1.0f / r2 */
85 15551 : __m256 ymm_invr2_8 = _mm256_rcp_ps(ymm_r2_8);
86 : ymm_nominator =
87 31102 : _mm256_add_ps(ymm_nominator, /* nominator += invr2 * pafZ[i] */
88 15551 : _mm256_mul_ps(ymm_invr2, _mm256_load_ps(pafZ + i)));
89 46653 : ymm_nominator = _mm256_add_ps(
90 : ymm_nominator,
91 15551 : _mm256_mul_ps(ymm_invr2_8, _mm256_load_ps(pafZ + i + 8)));
92 15551 : ymm_denominator = _mm256_add_ps(ymm_denominator,
93 : ymm_invr2); /* denominator += invr2 */
94 15551 : ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2_8);
95 15595 : mask =
96 15551 : _mm256_movemask_ps(_mm256_cmp_ps(
97 : ymm_r2, ymm_small, _CMP_LT_OS)) | /* if( r2 < fEpsilon) */
98 15587 : (_mm256_movemask_ps(_mm256_cmp_ps(ymm_r2_8, ymm_small, _CMP_LT_OS))
99 15595 : << 8);
100 15595 : if (mask)
101 1200 : break;
102 : }
103 : #else
104 : #define LOOP_SIZE 8
105 : size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
106 : for (i = 0; i < nPointsRound; i += LOOP_SIZE)
107 : {
108 : __m256 ymm_rx =
109 : _mm256_sub_ps(_mm256_load_ps(const_cast<float *>(pafX) + i),
110 : ymm_x); /* rx = pafX[i] - fXPoint */
111 : __m256 ymm_ry =
112 : _mm256_sub_ps(_mm256_load_ps(const_cast<float *>(pafY) + i),
113 : ymm_y); /* ry = pafY[i] - fYPoint */
114 : __m256 ymm_r2 = _mm256_add_ps(
115 : _mm256_mul_ps(ymm_rx, ymm_rx), /* r2 = rx * rx + ry * ry */
116 : _mm256_mul_ps(ymm_ry, ymm_ry));
117 : __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2); /* invr2 = 1.0f / r2 */
118 : ymm_nominator = _mm256_add_ps(
119 : ymm_nominator, /* nominator += invr2 * pafZ[i] */
120 : _mm256_mul_ps(ymm_invr2,
121 : _mm256_load_ps(const_cast<float *>(pafZ) + i)));
122 : ymm_denominator = _mm256_add_ps(ymm_denominator,
123 : ymm_invr2); /* denominator += invr2 */
124 : mask = _mm256_movemask_ps(_mm256_cmp_ps(
125 : ymm_r2, ymm_small, _CMP_LT_OS)); /* if( r2 < fEpsilon) */
126 : if (mask)
127 : break;
128 : }
129 : #endif
130 :
131 : // Find which i triggered r2 < fEpsilon.
132 1364 : if (mask)
133 : {
134 10082 : for (int j = 0; j < LOOP_SIZE; j++)
135 : {
136 10079 : if (mask & (1 << j))
137 : {
138 1196 : (*pdfValue) = (pafZ)[i + j];
139 :
140 : // GCC and MSVC need explicit zeroing.
141 : #if !defined(__clang__)
142 : _mm256_zeroupper();
143 : #endif
144 1195 : return CE_None;
145 : }
146 : }
147 : }
148 : #undef LOOP_SIZE
149 :
150 : // Get back nominator and denominator values for YMM registers.
151 : float afNominator[8];
152 : float afDenominator[8];
153 : _mm256_storeu_ps(afNominator, ymm_nominator);
154 : _mm256_storeu_ps(afDenominator, ymm_denominator);
155 :
156 : // MSVC doesn't emit AVX afterwards but may use SSE, so clear
157 : // upper bits. Other compilers will continue using AVX for the
158 : // below floating points operations.
159 : #if defined(_MSC_FULL_VER)
160 : _mm256_zeroupper();
161 : #endif
162 :
163 168 : float fNominator = afNominator[0] + afNominator[1] + afNominator[2] +
164 168 : afNominator[3] + afNominator[4] + afNominator[5] +
165 168 : afNominator[6] + afNominator[7];
166 168 : float fDenominator = afDenominator[0] + afDenominator[1] +
167 168 : afDenominator[2] + afDenominator[3] +
168 168 : afDenominator[4] + afDenominator[5] +
169 168 : afDenominator[6] + afDenominator[7];
170 :
171 : // Do the few remaining loop iterations.
172 328 : for (; i < nPoints; i++)
173 : {
174 175 : const float fRX = pafX[i] - fXPoint;
175 175 : const float fRY = pafY[i] - fYPoint;
176 175 : const float fR2 = fRX * fRX + fRY * fRY;
177 :
178 : // If the test point is close to the grid node, use the point
179 : // value directly as a node value to avoid singularity.
180 175 : if (fR2 < 0.0000000000001)
181 : {
182 15 : break;
183 : }
184 : else
185 : {
186 160 : const float fInvR2 = 1.0f / fR2;
187 160 : fNominator += fInvR2 * pafZ[i];
188 160 : fDenominator += fInvR2;
189 : }
190 : }
191 :
192 168 : if (i != nPoints)
193 : {
194 15 : (*pdfValue) = pafZ[i];
195 : }
196 153 : else if (fDenominator == 0.0)
197 : {
198 0 : (*pdfValue) =
199 : static_cast<const GDALGridInverseDistanceToAPowerOptions *>(
200 : poOptions)
201 0 : ->dfNoDataValue;
202 : }
203 : else
204 153 : (*pdfValue) = fNominator / fDenominator;
205 :
206 : // GCC needs explicit zeroing.
207 : #if defined(__GNUC__) && !defined(__clang__)
208 : _mm256_zeroupper();
209 : #endif
210 :
211 125 : return CE_None;
212 : }
213 :
214 : #endif /* HAVE_AVX_AT_COMPILE_TIME */
|