Line data Source code
1 : /******************************************************************************
2 : *
3 : * Project: Parquet Translator
4 : * Purpose: Implements OGRParquetDriver.
5 : * Author: Even Rouault, <even.rouault at spatialys.com>
6 : *
7 : ******************************************************************************
8 : * Copyright (c) 2022, Planet Labs
9 : *
10 : * Permission is hereby granted, free of charge, to any person obtaining a
11 : * copy of this software and associated documentation files (the "Software"),
12 : * to deal in the Software without restriction, including without limitation
13 : * the rights to use, copy, modify, merge, publish, distribute, sublicense,
14 : * and/or sell copies of the Software, and to permit persons to whom the
15 : * Software is furnished to do so, subject to the following conditions:
16 : *
17 : * The above copyright notice and this permission notice shall be included
18 : * in all copies or substantial portions of the Software.
19 : *
20 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
21 : * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 : * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 : * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 : * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25 : * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 : * DEALINGS IN THE SOFTWARE.
27 : ****************************************************************************/
28 :
29 : #include "ogr_parquet.h"
30 : #include "ogr_mem.h"
31 : #include "ogr_swq.h"
32 :
33 : #include "../arrow_common/ograrrowdataset.hpp"
34 : #include "../arrow_common/ograrrowlayer.hpp"
35 :
36 : /************************************************************************/
37 : /* OGRParquetDataset() */
38 : /************************************************************************/
39 :
40 681 : OGRParquetDataset::OGRParquetDataset(
41 681 : const std::shared_ptr<arrow::MemoryPool> &poMemoryPool)
42 681 : : OGRArrowDataset(poMemoryPool)
43 : {
44 681 : }
45 :
46 : /***********************************************************************/
47 : /* ExecuteSQL() */
48 : /***********************************************************************/
49 :
50 40 : OGRLayer *OGRParquetDataset::ExecuteSQL(const char *pszSQLCommand,
51 : OGRGeometry *poSpatialFilter,
52 : const char *pszDialect)
53 : {
54 : /* -------------------------------------------------------------------- */
55 : /* Special cases for SQL optimizations */
56 : /* -------------------------------------------------------------------- */
57 40 : if (STARTS_WITH_CI(pszSQLCommand, "SELECT ") &&
58 10 : (pszDialect == nullptr || EQUAL(pszDialect, "") ||
59 0 : EQUAL(pszDialect, "OGRSQL")))
60 : {
61 35 : swq_select oSelect;
62 35 : if (oSelect.preparse(pszSQLCommand) != CE_None)
63 0 : return nullptr;
64 :
65 : /* --------------------------------------------------------------------
66 : */
67 : /* MIN/MAX/COUNT optimization */
68 : /* --------------------------------------------------------------------
69 : */
70 35 : if (oSelect.join_count == 0 && oSelect.poOtherSelect == nullptr &&
71 35 : oSelect.table_count == 1 && oSelect.order_specs == 0 &&
72 35 : oSelect.query_mode != SWQM_DISTINCT_LIST &&
73 93 : oSelect.where_expr == nullptr &&
74 23 : CPLTestBool(
75 : CPLGetConfigOption("OGR_PARQUET_USE_STATISTICS", "YES")))
76 : {
77 1 : auto poLayer = dynamic_cast<OGRParquetLayer *>(
78 23 : GetLayerByName(oSelect.table_defs[0].table_name));
79 23 : if (poLayer)
80 : {
81 16 : OGRMemLayer *poMemLayer = nullptr;
82 16 : const auto poLayerDefn = poLayer->GetLayerDefn();
83 :
84 16 : int i = 0; // Used after for.
85 47 : for (; i < oSelect.result_columns(); i++)
86 : {
87 44 : swq_col_func col_func = oSelect.column_defs[i].col_func;
88 44 : if (!(col_func == SWQCF_MIN || col_func == SWQCF_MAX ||
89 : col_func == SWQCF_COUNT))
90 13 : break;
91 :
92 : const char *pszFieldName =
93 37 : oSelect.column_defs[i].field_name;
94 37 : if (pszFieldName == nullptr)
95 0 : break;
96 37 : if (oSelect.column_defs[i].target_type != SWQ_OTHER)
97 0 : break;
98 :
99 : const int iOGRField =
100 37 : (EQUAL(pszFieldName, poLayer->GetFIDColumn()) &&
101 2 : pszFieldName[0])
102 39 : ? OGRParquetLayer::OGR_FID_INDEX
103 35 : : poLayerDefn->GetFieldIndex(pszFieldName);
104 37 : if (iOGRField < 0 &&
105 : iOGRField != OGRParquetLayer::OGR_FID_INDEX)
106 4 : break;
107 :
108 : OGRField sField;
109 33 : OGR_RawField_SetNull(&sField);
110 33 : OGRFieldType eType = OFTReal;
111 33 : OGRFieldSubType eSubType = OFSTNone;
112 : const int iCol =
113 : iOGRField == OGRParquetLayer::OGR_FID_INDEX
114 64 : ? poLayer->GetFIDParquetColumn()
115 31 : : poLayer->GetMapFieldIndexToParquetColumn()
116 31 : [iOGRField];
117 33 : if (iCol < 0)
118 0 : break;
119 : const auto metadata =
120 33 : poLayer->GetReader()->parquet_reader()->metadata();
121 33 : const auto numRowGroups = metadata->num_row_groups();
122 33 : bool bFound = false;
123 33 : std::string sVal;
124 :
125 33 : if (numRowGroups > 0)
126 : {
127 : const auto rowGroup0columnChunk =
128 66 : metadata->RowGroup(0)->ColumnChunk(iCol);
129 : const auto rowGroup0Stats =
130 66 : rowGroup0columnChunk->statistics();
131 65 : if (rowGroup0columnChunk->is_stats_set() &&
132 32 : rowGroup0Stats)
133 : {
134 : OGRField sFieldDummy;
135 : bool bFoundDummy;
136 64 : std::string sValDummy;
137 :
138 32 : if (col_func == SWQCF_MIN)
139 : {
140 15 : CPL_IGNORE_RET_VAL(
141 15 : poLayer->GetMinMaxForOGRField(
142 : /* iRowGroup=*/-1, // -1 for all
143 : iOGRField, true, sField, bFound, false,
144 : sFieldDummy, bFoundDummy, eType,
145 : eSubType, sVal, sValDummy));
146 : }
147 17 : else if (col_func == SWQCF_MAX)
148 : {
149 15 : CPL_IGNORE_RET_VAL(
150 15 : poLayer->GetMinMaxForOGRField(
151 : /* iRowGroup=*/-1, // -1 for all
152 : iOGRField, false, sFieldDummy,
153 : bFoundDummy, true, sField, bFound,
154 : eType, eSubType, sValDummy, sVal));
155 : }
156 2 : else if (col_func == SWQCF_COUNT)
157 : {
158 2 : if (oSelect.column_defs[i].distinct_flag)
159 : {
160 1 : eType = OFTInteger64;
161 1 : sField.Integer64 = 0;
162 1 : for (int iGroup = 0; iGroup < numRowGroups;
163 : iGroup++)
164 : {
165 : const auto columnChunk =
166 1 : metadata->RowGroup(iGroup)
167 1 : ->ColumnChunk(iCol);
168 : const auto colStats =
169 1 : columnChunk->statistics();
170 2 : if (columnChunk->is_stats_set() &&
171 2 : colStats &&
172 1 : colStats->HasDistinctCount())
173 : {
174 : // Statistics generated by arrow-cpp
175 : // Parquet writer seem to be buggy,
176 : // as distinct_count() is always
177 : // zero. We can detect this: if
178 : // there are non-null values, then
179 : // distinct_count() should be > 0.
180 0 : if (colStats->distinct_count() ==
181 0 : 0 &&
182 0 : colStats->num_values() > 0)
183 : {
184 0 : bFound = false;
185 0 : break;
186 : }
187 0 : sField.Integer64 +=
188 0 : colStats->distinct_count();
189 0 : bFound = true;
190 : }
191 : else
192 : {
193 1 : bFound = false;
194 1 : break;
195 : }
196 : }
197 : }
198 : else
199 : {
200 1 : eType = OFTInteger64;
201 1 : sField.Integer64 = 0;
202 1 : bFound = true;
203 3 : for (int iGroup = 0; iGroup < numRowGroups;
204 : iGroup++)
205 : {
206 : const auto columnChunk =
207 2 : metadata->RowGroup(iGroup)
208 4 : ->ColumnChunk(iCol);
209 : const auto colStats =
210 4 : columnChunk->statistics();
211 4 : if (columnChunk->is_stats_set() &&
212 2 : colStats)
213 : {
214 2 : sField.Integer64 +=
215 2 : colStats->num_values();
216 : }
217 : else
218 : {
219 0 : bFound = false;
220 : }
221 : }
222 : }
223 : }
224 : }
225 : else
226 : {
227 1 : CPLDebug("PARQUET",
228 : "Statistics not available for field %s",
229 : pszFieldName);
230 : }
231 : }
232 33 : if (!bFound)
233 : {
234 2 : break;
235 : }
236 :
237 31 : if (poMemLayer == nullptr)
238 : {
239 3 : poMemLayer =
240 3 : new OGRMemLayer("SELECT", nullptr, wkbNone);
241 : OGRFeature *poFeature =
242 3 : new OGRFeature(poMemLayer->GetLayerDefn());
243 3 : CPL_IGNORE_RET_VAL(
244 3 : poMemLayer->CreateFeature(poFeature));
245 3 : delete poFeature;
246 : }
247 :
248 : const char *pszMinMaxFieldName =
249 47 : CPLSPrintf("%s_%s",
250 : (col_func == SWQCF_MIN) ? "MIN"
251 16 : : (col_func == SWQCF_MAX) ? "MAX"
252 : : "COUNT",
253 31 : oSelect.column_defs[i].field_name);
254 62 : OGRFieldDefn oFieldDefn(pszMinMaxFieldName, eType);
255 31 : oFieldDefn.SetSubType(eSubType);
256 31 : poMemLayer->CreateField(&oFieldDefn);
257 :
258 31 : OGRFeature *poFeature = poMemLayer->GetFeature(0);
259 31 : poFeature->SetField(oFieldDefn.GetNameRef(), &sField);
260 31 : CPL_IGNORE_RET_VAL(poMemLayer->SetFeature(poFeature));
261 31 : delete poFeature;
262 : }
263 16 : if (i != oSelect.result_columns())
264 : {
265 13 : delete poMemLayer;
266 : }
267 : else
268 : {
269 3 : CPLDebug("PARQUET",
270 : "Using optimized MIN/MAX/COUNT implementation");
271 3 : return poMemLayer;
272 : }
273 : }
274 : }
275 : }
276 :
277 37 : return GDALDataset::ExecuteSQL(pszSQLCommand, poSpatialFilter, pszDialect);
278 : }
279 :
280 : /***********************************************************************/
281 : /* ReleaseResultSet() */
282 : /***********************************************************************/
283 :
284 32 : void OGRParquetDataset::ReleaseResultSet(OGRLayer *poResultsSet)
285 : {
286 32 : delete poResultsSet;
287 32 : }
288 :
289 : /************************************************************************/
290 : /* TestCapability() */
291 : /************************************************************************/
292 :
293 48 : int OGRParquetDataset::TestCapability(const char *pszCap)
294 :
295 : {
296 48 : if (EQUAL(pszCap, ODsCZGeometries))
297 5 : return true;
298 43 : else if (EQUAL(pszCap, ODsCMeasuredGeometries))
299 10 : return true;
300 :
301 33 : return false;
302 : }
|