Line data Source code
1 : /******************************************************************************
2 : *
3 : * Project: GDAL Gridding API.
4 : * Purpose: Implementation of GDAL scattered data gridder.
5 : * Author: Even Rouault, <even dot rouault at spatialys.com>
6 : *
7 : ******************************************************************************
8 : * Copyright (c) 2013, Even Rouault <even dot rouault at spatialys.com>
9 : *
10 : * Permission is hereby granted, free of charge, to any person obtaining a
11 : * copy of this software and associated documentation files (the "Software"),
12 : * to deal in the Software without restriction, including without limitation
13 : * the rights to use, copy, modify, merge, publish, distribute, sublicense,
14 : * and/or sell copies of the Software, and to permit persons to whom the
15 : * Software is furnished to do so, subject to the following conditions:
16 : *
17 : * The above copyright notice and this permission notice shall be included
18 : * in all copies or substantial portions of the Software.
19 : *
20 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
21 : * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 : * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 : * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 : * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25 : * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 : * DEALINGS IN THE SOFTWARE.
27 : ****************************************************************************/
28 :
29 : #include "gdalgrid.h"
30 : #include "gdalgrid_priv.h"
31 :
32 : #ifdef HAVE_SSE_AT_COMPILE_TIME
33 : #include <xmmintrin.h>
34 :
35 : /************************************************************************/
36 : /* GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE() */
37 : /************************************************************************/
38 :
39 501 : CPLErr GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE(
40 : const void *poOptions, GUInt32 nPoints,
41 : CPL_UNUSED const double *unused_padfX,
42 : CPL_UNUSED const double *unused_padfY,
43 : CPL_UNUSED const double *unused_padfZ, double dfXPoint, double dfYPoint,
44 : double *pdfValue, void *hExtraParamsIn)
45 : {
46 501 : size_t i = 0;
47 501 : GDALGridExtraParameters *psExtraParams =
48 : static_cast<GDALGridExtraParameters *>(hExtraParamsIn);
49 501 : const float *pafX = psExtraParams->pafX;
50 501 : const float *pafY = psExtraParams->pafY;
51 501 : const float *pafZ = psExtraParams->pafZ;
52 :
53 501 : const float fEpsilon = 0.0000000000001f;
54 501 : const float fXPoint = static_cast<float>(dfXPoint);
55 501 : const float fYPoint = static_cast<float>(dfYPoint);
56 501 : const __m128 xmm_small = _mm_load1_ps(const_cast<float *>(&fEpsilon));
57 501 : const __m128 xmm_x = _mm_load1_ps(const_cast<float *>(&fXPoint));
58 501 : const __m128 xmm_y = _mm_load1_ps(const_cast<float *>(&fYPoint));
59 501 : __m128 xmm_nominator = _mm_setzero_ps();
60 501 : __m128 xmm_denominator = _mm_setzero_ps();
61 501 : int mask = 0;
62 :
63 : #if defined(__x86_64) || defined(_M_X64)
64 : // This would also work in 32bit mode, but there are only 8 XMM registers
65 : // whereas we have 16 for 64bit.
66 501 : const size_t LOOP_SIZE = 8;
67 501 : size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
68 10301 : for (i = 0; i < nPointsRound; i += LOOP_SIZE)
69 : {
70 : // rx = pafX[i] - fXPoint
71 20400 : __m128 xmm_rx = _mm_sub_ps(_mm_load_ps(pafX + i), xmm_x);
72 20400 : __m128 xmm_rx_4 = _mm_sub_ps(_mm_load_ps(pafX + i + 4), xmm_x);
73 : // ry = pafY[i] - fYPoint
74 20400 : __m128 xmm_ry = _mm_sub_ps(_mm_load_ps(pafY + i), xmm_y);
75 30600 : __m128 xmm_ry_4 = _mm_sub_ps(_mm_load_ps(pafY + i + 4), xmm_y);
76 : // r2 = rx * rx + ry * ry
77 : __m128 xmm_r2 =
78 30600 : _mm_add_ps(_mm_mul_ps(xmm_rx, xmm_rx), _mm_mul_ps(xmm_ry, xmm_ry));
79 30600 : __m128 xmm_r2_4 = _mm_add_ps(_mm_mul_ps(xmm_rx_4, xmm_rx_4),
80 : _mm_mul_ps(xmm_ry_4, xmm_ry_4));
81 : // invr2 = 1.0f / r2
82 10200 : __m128 xmm_invr2 = _mm_rcp_ps(xmm_r2);
83 10200 : __m128 xmm_invr2_4 = _mm_rcp_ps(xmm_r2_4);
84 : // nominator += invr2 * pafZ[i]
85 20400 : xmm_nominator = _mm_add_ps(
86 10200 : xmm_nominator, _mm_mul_ps(xmm_invr2, _mm_load_ps(pafZ + i)));
87 30600 : xmm_nominator = _mm_add_ps(
88 10200 : xmm_nominator, _mm_mul_ps(xmm_invr2_4, _mm_load_ps(pafZ + i + 4)));
89 : // denominator += invr2
90 10200 : xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2);
91 10200 : xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2_4);
92 : // if( r2 < fEpsilon)
93 20400 : mask = _mm_movemask_ps(_mm_cmplt_ps(xmm_r2, xmm_small)) |
94 10200 : (_mm_movemask_ps(_mm_cmplt_ps(xmm_r2_4, xmm_small)) << 4);
95 10200 : if (mask)
96 400 : break;
97 : }
98 : #else
99 : #define LOOP_SIZE 4
100 : size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
101 : for (i = 0; i < nPointsRound; i += LOOP_SIZE)
102 : {
103 : __m128 xmm_rx = _mm_sub_ps(_mm_load_ps(pafX + i),
104 : xmm_x); /* rx = pafX[i] - fXPoint */
105 : __m128 xmm_ry = _mm_sub_ps(_mm_load_ps(pafY + i),
106 : xmm_y); /* ry = pafY[i] - fYPoint */
107 : __m128 xmm_r2 =
108 : _mm_add_ps(_mm_mul_ps(xmm_rx, xmm_rx), /* r2 = rx * rx + ry * ry */
109 : _mm_mul_ps(xmm_ry, xmm_ry));
110 : __m128 xmm_invr2 = _mm_rcp_ps(xmm_r2); /* invr2 = 1.0f / r2 */
111 : xmm_nominator =
112 : _mm_add_ps(xmm_nominator, /* nominator += invr2 * pafZ[i] */
113 : _mm_mul_ps(xmm_invr2, _mm_load_ps(pafZ + i)));
114 : xmm_denominator =
115 : _mm_add_ps(xmm_denominator, xmm_invr2); /* denominator += invr2 */
116 : mask = _mm_movemask_ps(
117 : _mm_cmplt_ps(xmm_r2, xmm_small)); /* if( r2 < fEpsilon) */
118 : if (mask)
119 : break;
120 : }
121 : #endif
122 :
123 : // Find which i triggered r2 < fEpsilon.
124 501 : if (mask)
125 : {
126 1800 : for (size_t j = 0; j < LOOP_SIZE; j++)
127 : {
128 1800 : if (mask & (1 << j))
129 : {
130 400 : (*pdfValue) = (pafZ)[i + j];
131 400 : return CE_None;
132 : }
133 : }
134 : }
135 :
136 : // Get back nominator and denominator values for XMM registers.
137 : float afNominator[4];
138 : float afDenominator[4];
139 : _mm_storeu_ps(afNominator, xmm_nominator);
140 : _mm_storeu_ps(afDenominator, xmm_denominator);
141 :
142 101 : float fNominator =
143 101 : afNominator[0] + afNominator[1] + afNominator[2] + afNominator[3];
144 101 : float fDenominator = afDenominator[0] + afDenominator[1] +
145 101 : afDenominator[2] + afDenominator[3];
146 :
147 : /* Do the few remaining loop iterations */
148 201 : for (; i < nPoints; i++)
149 : {
150 101 : const float fRX = pafX[i] - fXPoint;
151 101 : const float fRY = pafY[i] - fYPoint;
152 101 : const float fR2 = fRX * fRX + fRY * fRY;
153 :
154 : // If the test point is close to the grid node, use the point
155 : // value directly as a node value to avoid singularity.
156 101 : if (fR2 < 0.0000000000001)
157 : {
158 1 : break;
159 : }
160 : else
161 : {
162 100 : const float fInvR2 = 1.0f / fR2;
163 100 : fNominator += fInvR2 * pafZ[i];
164 100 : fDenominator += fInvR2;
165 : }
166 : }
167 :
168 101 : if (i != nPoints)
169 : {
170 1 : (*pdfValue) = pafZ[i];
171 : }
172 100 : else if (fDenominator == 0.0)
173 : {
174 0 : (*pdfValue) =
175 : static_cast<const GDALGridInverseDistanceToAPowerOptions *>(
176 : poOptions)
177 0 : ->dfNoDataValue;
178 : }
179 : else
180 : {
181 100 : (*pdfValue) = fNominator / fDenominator;
182 : }
183 :
184 101 : return CE_None;
185 : }
186 :
187 : #endif /* HAVE_SSE_AT_COMPILE_TIME */
|