Line data Source code
1 : /******************************************************************************
2 : *
3 : * Project: Parquet Translator
4 : * Purpose: Implements OGRParquetDriver.
5 : * Author: Even Rouault, <even.rouault at spatialys.com>
6 : *
7 : ******************************************************************************
8 : * Copyright (c) 2022, Planet Labs
9 : *
10 : * SPDX-License-Identifier: MIT
11 : ****************************************************************************/
12 :
13 : #ifdef STANDALONE
14 : #include "gdal_version.h"
15 : #else
16 : #undef DO_NOT_DEFINE_GDAL_DATE_NAME
17 : #include "gdal_version_full/gdal_version.h"
18 : #endif
19 :
20 : #include "ogr_parquet.h"
21 :
22 : #include "../arrow_common/ograrrowwriterlayer.hpp"
23 :
24 : #include "ogr_wkb.h"
25 :
26 : #include <cassert>
27 : #include <utility>
28 :
29 : /************************************************************************/
30 : /* OGRParquetWriterLayer() */
31 : /************************************************************************/
32 :
33 364 : OGRParquetWriterLayer::OGRParquetWriterLayer(
34 : OGRParquetWriterDataset *poDataset, arrow::MemoryPool *poMemoryPool,
35 : const std::shared_ptr<arrow::io::OutputStream> &poOutputStream,
36 364 : const char *pszLayerName)
37 : : OGRArrowWriterLayer(poMemoryPool, poOutputStream, pszLayerName),
38 364 : m_poDataset(poDataset)
39 : {
40 364 : m_bWriteFieldArrowExtensionName = CPLTestBool(
41 : CPLGetConfigOption("OGR_PARQUET_WRITE_ARROW_EXTENSION_NAME", "NO"));
42 364 : }
43 :
44 : /************************************************************************/
45 : /* Close() */
46 : /************************************************************************/
47 :
48 361 : bool OGRParquetWriterLayer::Close()
49 : {
50 361 : if (m_poTmpGPKGLayer)
51 : {
52 3 : if (!CopyTmpGpkgLayerToFinalFile())
53 0 : return false;
54 : }
55 :
56 361 : if (m_bInitializationOK)
57 : {
58 361 : if (!FinalizeWriting())
59 0 : return false;
60 : }
61 :
62 361 : return true;
63 : }
64 :
65 : /************************************************************************/
66 : /* CopyTmpGpkgLayerToFinalFile() */
67 : /************************************************************************/
68 :
69 3 : bool OGRParquetWriterLayer::CopyTmpGpkgLayerToFinalFile()
70 : {
71 3 : if (!m_poTmpGPKGLayer)
72 : {
73 0 : return true;
74 : }
75 :
76 3 : CPLDebug("PARQUET", "CopyTmpGpkgLayerToFinalFile(): start...");
77 :
78 3 : VSIUnlink(m_poTmpGPKG->GetDescription());
79 :
80 6 : OGRFeature oFeat(m_poFeatureDefn);
81 :
82 : // Interval in terms of features between 2 debug progress report messages
83 3 : constexpr int PROGRESS_FC_INTERVAL = 100 * 1000;
84 :
85 : // First, write features without geometries
86 : {
87 3 : auto poTmpLayer = std::unique_ptr<OGRLayer>(m_poTmpGPKG->ExecuteSQL(
88 : "SELECT serialized_feature FROM tmp WHERE fid NOT IN (SELECT id "
89 : "FROM rtree_tmp_geom)",
90 3 : nullptr, nullptr));
91 3 : if (!poTmpLayer)
92 0 : return false;
93 1005 : for (const auto &poSrcFeature : poTmpLayer.get())
94 : {
95 1002 : int nBytesFeature = 0;
96 : const GByte *pabyFeatureData =
97 1002 : poSrcFeature->GetFieldAsBinary(0, &nBytesFeature);
98 1002 : if (!oFeat.DeserializeFromBinary(pabyFeatureData, nBytesFeature))
99 : {
100 0 : CPLError(CE_Failure, CPLE_AppDefined,
101 : "Cannot deserialize feature");
102 0 : return false;
103 : }
104 1002 : if (OGRArrowWriterLayer::ICreateFeature(&oFeat) != OGRERR_NONE)
105 : {
106 0 : return false;
107 : }
108 :
109 1002 : if ((m_nFeatureCount % PROGRESS_FC_INTERVAL) == 0)
110 : {
111 0 : CPLDebugProgress(
112 : "PARQUET",
113 : "CopyTmpGpkgLayerToFinalFile(): %.02f%% progress",
114 0 : 100.0 * double(m_nFeatureCount) /
115 0 : double(m_nTmpFeatureCount));
116 : }
117 : }
118 :
119 3 : if (!FlushFeatures())
120 : {
121 0 : return false;
122 : }
123 : }
124 :
125 : // Now walk through the GPKG RTree for features with geometries
126 : // Cf https://github.com/sqlite/sqlite/blob/master/ext/rtree/rtree.c
127 : // for the description of the content of the rtree _node table
128 6 : std::vector<std::pair<int64_t, int>> aNodeNoDepthPair;
129 3 : int nTreeDepth = 0;
130 : // Queue the root node
131 : aNodeNoDepthPair.emplace_back(
132 3 : std::make_pair(/* nodeNo = */ 1, /* depth = */ 0));
133 3 : int nCountWrittenFeaturesSinceLastFlush = 0;
134 50 : while (!aNodeNoDepthPair.empty())
135 : {
136 47 : const auto &oLastPair = aNodeNoDepthPair.back();
137 47 : const int64_t nNodeNo = oLastPair.first;
138 47 : const int nCurDepth = oLastPair.second;
139 : //CPLDebug("PARQUET", "Reading nodeNode=%d, curDepth=%d", int(nNodeNo), nCurDepth);
140 47 : aNodeNoDepthPair.pop_back();
141 :
142 47 : auto poRTreeLayer = std::unique_ptr<OGRLayer>(m_poTmpGPKG->ExecuteSQL(
143 : CPLSPrintf("SELECT data FROM rtree_tmp_geom_node WHERE nodeno "
144 : "= " CPL_FRMT_GIB,
145 : static_cast<GIntBig>(nNodeNo)),
146 47 : nullptr, nullptr));
147 47 : if (!poRTreeLayer)
148 : {
149 0 : CPLError(CE_Failure, CPLE_AppDefined,
150 : "Cannot read node " CPL_FRMT_GIB,
151 : static_cast<GIntBig>(nNodeNo));
152 0 : return false;
153 : }
154 : const auto poRTreeFeature =
155 47 : std::unique_ptr<const OGRFeature>(poRTreeLayer->GetNextFeature());
156 47 : if (!poRTreeFeature)
157 : {
158 0 : CPLError(CE_Failure, CPLE_AppDefined,
159 : "Cannot read node " CPL_FRMT_GIB,
160 : static_cast<GIntBig>(nNodeNo));
161 0 : return false;
162 : }
163 :
164 47 : int nNodeBytes = 0;
165 : const GByte *pabyNodeData =
166 47 : poRTreeFeature->GetFieldAsBinary(0, &nNodeBytes);
167 47 : constexpr int BLOB_HEADER_SIZE = 4;
168 47 : if (nNodeBytes < BLOB_HEADER_SIZE)
169 : {
170 0 : CPLError(CE_Failure, CPLE_AppDefined,
171 : "Not enough bytes when reading node " CPL_FRMT_GIB,
172 : static_cast<GIntBig>(nNodeNo));
173 0 : return false;
174 : }
175 47 : if (nNodeNo == 1)
176 : {
177 : // Get the RTree depth from the root node
178 3 : nTreeDepth = (pabyNodeData[0] << 8) | pabyNodeData[1];
179 : //CPLDebug("PARQUET", "nTreeDepth = %d", nTreeDepth);
180 : }
181 :
182 47 : const int nCellCount = (pabyNodeData[2] << 8) | pabyNodeData[3];
183 47 : constexpr int SIZEOF_CELL = 24; // int64_t + 4 float
184 47 : if (nNodeBytes < BLOB_HEADER_SIZE + SIZEOF_CELL * nCellCount)
185 : {
186 0 : CPLError(CE_Failure, CPLE_AppDefined,
187 : "Not enough bytes when reading node " CPL_FRMT_GIB,
188 : static_cast<GIntBig>(nNodeNo));
189 0 : return false;
190 : }
191 :
192 47 : size_t nOffset = BLOB_HEADER_SIZE;
193 47 : if (nCurDepth == nTreeDepth)
194 : {
195 : // Leaf node: it references feature IDs.
196 :
197 : // If we are about to go above m_nRowGroupSize, flush past
198 : // features now, to improve the spatial compacity of the row group.
199 45 : if (m_nRowGroupSize > nCellCount &&
200 45 : nCountWrittenFeaturesSinceLastFlush + nCellCount >
201 45 : m_nRowGroupSize)
202 : {
203 14 : nCountWrittenFeaturesSinceLastFlush = 0;
204 14 : if (!FlushFeatures())
205 : {
206 0 : return false;
207 : }
208 : }
209 :
210 : // nCellCount shouldn't be over 51 normally, but even 65535
211 : // would be fine...
212 45 : assert(nCellCount <= 65535);
213 1247 : for (int i = 0; i < nCellCount; ++i)
214 : {
215 : int64_t nFID;
216 1202 : memcpy(&nFID, pabyNodeData + nOffset, sizeof(int64_t));
217 1202 : CPL_MSBPTR64(&nFID);
218 :
219 : const auto poSrcFeature = std::unique_ptr<const OGRFeature>(
220 1202 : m_poTmpGPKGLayer->GetFeature(nFID));
221 1202 : if (!poSrcFeature)
222 : {
223 0 : CPLError(CE_Failure, CPLE_AppDefined,
224 : "Cannot get feature " CPL_FRMT_GIB,
225 : static_cast<GIntBig>(nFID));
226 0 : return false;
227 : }
228 :
229 1202 : int nBytesFeature = 0;
230 : const GByte *pabyFeatureData =
231 1202 : poSrcFeature->GetFieldAsBinary(0, &nBytesFeature);
232 1202 : if (!oFeat.DeserializeFromBinary(pabyFeatureData,
233 : nBytesFeature))
234 : {
235 0 : CPLError(CE_Failure, CPLE_AppDefined,
236 : "Cannot deserialize feature");
237 0 : return false;
238 : }
239 1202 : if (OGRArrowWriterLayer::ICreateFeature(&oFeat) != OGRERR_NONE)
240 : {
241 0 : return false;
242 : }
243 :
244 1202 : nOffset += SIZEOF_CELL;
245 :
246 1202 : ++nCountWrittenFeaturesSinceLastFlush;
247 :
248 1202 : if ((m_nFeatureCount % PROGRESS_FC_INTERVAL) == 0 ||
249 1202 : m_nFeatureCount == m_nTmpFeatureCount / 2)
250 : {
251 2 : CPLDebugProgress(
252 : "PARQUET",
253 : "CopyTmpGpkgLayerToFinalFile(): %.02f%% progress",
254 2 : 100.0 * double(m_nFeatureCount) /
255 2 : double(m_nTmpFeatureCount));
256 : }
257 : }
258 : }
259 : else
260 : {
261 : // Non-leaf node: it references child nodes.
262 :
263 : // nCellCount shouldn't be over 51 normally, but even 65535
264 : // would be fine...
265 2 : assert(nCellCount <= 65535);
266 46 : for (int i = 0; i < nCellCount; ++i)
267 : {
268 : int64_t nNode;
269 44 : memcpy(&nNode, pabyNodeData + nOffset, sizeof(int64_t));
270 44 : CPL_MSBPTR64(&nNode);
271 : aNodeNoDepthPair.emplace_back(
272 44 : std::make_pair(nNode, nCurDepth + 1));
273 44 : nOffset += SIZEOF_CELL;
274 : }
275 : }
276 : }
277 :
278 3 : CPLDebug("PARQUET",
279 : "CopyTmpGpkgLayerToFinalFile(): 100%%, successfully finished");
280 3 : return true;
281 : }
282 :
283 : /************************************************************************/
284 : /* IsSupportedGeometryType() */
285 : /************************************************************************/
286 :
287 360 : bool OGRParquetWriterLayer::IsSupportedGeometryType(
288 : OGRwkbGeometryType eGType) const
289 : {
290 360 : const auto eFlattenType = wkbFlatten(eGType);
291 360 : if (!OGR_GT_HasM(eGType) && eFlattenType <= wkbGeometryCollection)
292 : {
293 359 : return true;
294 : }
295 :
296 : const auto osConfigOptionName =
297 3 : "OGR_" + GetDriverUCName() + "_ALLOW_ALL_DIMS";
298 1 : if (CPLTestBool(CPLGetConfigOption(osConfigOptionName.c_str(), "NO")))
299 : {
300 0 : return true;
301 : }
302 :
303 1 : CPLError(CE_Failure, CPLE_NotSupported,
304 : "Only 2D and Z geometry types are supported (unless the "
305 : "%s configuration option is set to YES)",
306 : osConfigOptionName.c_str());
307 1 : return false;
308 : }
309 :
310 : /************************************************************************/
311 : /* SetOptions() */
312 : /************************************************************************/
313 :
314 364 : bool OGRParquetWriterLayer::SetOptions(
315 : const OGRGeomFieldDefn *poSrcGeomFieldDefn, CSLConstList papszOptions)
316 : {
317 364 : m_aosCreationOptions = papszOptions;
318 :
319 364 : const char *pszWriteCoveringBBox = CSLFetchNameValueDef(
320 : papszOptions, "WRITE_COVERING_BBOX",
321 : CPLGetConfigOption("OGR_PARQUET_WRITE_COVERING_BBOX", nullptr));
322 364 : m_bWriteBBoxStruct =
323 364 : pszWriteCoveringBBox == nullptr || CPLTestBool(pszWriteCoveringBBox);
324 :
325 : m_oBBoxStructFieldName =
326 364 : CSLFetchNameValueDef(papszOptions, "COVERING_BBOX_NAME", "");
327 :
328 364 : if (CPLTestBool(CSLFetchNameValueDef(papszOptions, "SORT_BY_BBOX", "NO")))
329 : {
330 8 : const std::string osTmpGPKG(std::string(m_poDataset->GetDescription()) +
331 4 : ".tmp.gpkg");
332 4 : auto poGPKGDrv = GetGDALDriverManager()->GetDriverByName("GPKG");
333 4 : if (!poGPKGDrv)
334 : {
335 1 : CPLError(
336 : CE_Failure, CPLE_AppDefined,
337 : "Driver GPKG required for SORT_BY_BBOX layer creation option");
338 1 : return false;
339 : }
340 3 : m_poTmpGPKG.reset(poGPKGDrv->Create(osTmpGPKG.c_str(), 0, 0, 0,
341 : GDT_Unknown, nullptr));
342 3 : if (!m_poTmpGPKG)
343 0 : return false;
344 3 : m_poTmpGPKG->MarkSuppressOnClose();
345 3 : m_poTmpGPKGLayer = m_poTmpGPKG->CreateLayer("tmp");
346 12 : if (!m_poTmpGPKGLayer ||
347 : // Serialized feature
348 3 : m_poTmpGPKGLayer->CreateField(
349 3 : std::make_unique<OGRFieldDefn>("serialized_feature", OFTBinary)
350 3 : .get()) != OGRERR_NONE ||
351 : // FlushCache is needed to avoid SQLite3 errors on empty layers
352 9 : m_poTmpGPKG->FlushCache() != CE_None ||
353 3 : m_poTmpGPKGLayer->StartTransaction() != OGRERR_NONE)
354 : {
355 0 : return false;
356 : }
357 : }
358 :
359 : const char *pszGeomEncoding =
360 363 : CSLFetchNameValue(papszOptions, "GEOMETRY_ENCODING");
361 363 : m_eGeomEncoding = OGRArrowGeomEncoding::WKB;
362 363 : if (pszGeomEncoding)
363 : {
364 211 : if (EQUAL(pszGeomEncoding, "WKB"))
365 7 : m_eGeomEncoding = OGRArrowGeomEncoding::WKB;
366 204 : else if (EQUAL(pszGeomEncoding, "WKT"))
367 8 : m_eGeomEncoding = OGRArrowGeomEncoding::WKT;
368 196 : else if (EQUAL(pszGeomEncoding, "GEOARROW_INTERLEAVED"))
369 : {
370 28 : CPLErrorOnce(
371 : CE_Warning, CPLE_AppDefined,
372 : "Use of GEOMETRY_ENCODING=GEOARROW_INTERLEAVED is not "
373 : "recommended. "
374 : "GeoParquet 1.1 uses GEOMETRY_ENCODING=GEOARROW (struct) "
375 : "instead.");
376 28 : m_eGeomEncoding = OGRArrowGeomEncoding::GEOARROW_FSL_GENERIC;
377 : }
378 168 : else if (EQUAL(pszGeomEncoding, "GEOARROW") ||
379 0 : EQUAL(pszGeomEncoding, "GEOARROW_STRUCT"))
380 168 : m_eGeomEncoding = OGRArrowGeomEncoding::GEOARROW_STRUCT_GENERIC;
381 : else
382 : {
383 0 : CPLError(CE_Failure, CPLE_NotSupported,
384 : "Unsupported GEOMETRY_ENCODING = %s", pszGeomEncoding);
385 0 : return false;
386 : }
387 : }
388 :
389 : const char *pszCoordPrecision =
390 363 : CSLFetchNameValue(papszOptions, "COORDINATE_PRECISION");
391 363 : if (pszCoordPrecision)
392 0 : m_nWKTCoordinatePrecision = atoi(pszCoordPrecision);
393 :
394 363 : m_bForceCounterClockwiseOrientation =
395 363 : EQUAL(CSLFetchNameValueDef(papszOptions, "POLYGON_ORIENTATION",
396 : "COUNTERCLOCKWISE"),
397 : "COUNTERCLOCKWISE");
398 :
399 : const auto eGType =
400 363 : poSrcGeomFieldDefn ? poSrcGeomFieldDefn->GetType() : wkbNone;
401 363 : if (poSrcGeomFieldDefn && eGType != wkbNone)
402 : {
403 334 : if (!IsSupportedGeometryType(eGType))
404 : {
405 1 : return false;
406 : }
407 :
408 333 : m_poFeatureDefn->SetGeomType(eGType);
409 333 : auto eGeomEncoding = m_eGeomEncoding;
410 333 : if (eGeomEncoding == OGRArrowGeomEncoding::GEOARROW_FSL_GENERIC ||
411 305 : eGeomEncoding == OGRArrowGeomEncoding::GEOARROW_STRUCT_GENERIC)
412 : {
413 196 : const auto eEncodingType = eGeomEncoding;
414 196 : eGeomEncoding = GetPreciseArrowGeomEncoding(eEncodingType, eGType);
415 196 : if (eGeomEncoding == eEncodingType)
416 0 : return false;
417 : }
418 333 : m_aeGeomEncoding.push_back(eGeomEncoding);
419 :
420 666 : std::string osGeometryName;
421 : const char *pszGeometryName =
422 333 : CSLFetchNameValue(papszOptions, "GEOMETRY_NAME");
423 333 : if (pszGeometryName)
424 16 : osGeometryName = pszGeometryName;
425 317 : else if (poSrcGeomFieldDefn->GetNameRef()[0])
426 4 : osGeometryName = poSrcGeomFieldDefn->GetNameRef();
427 : else
428 313 : osGeometryName = "geometry";
429 333 : m_poFeatureDefn->GetGeomFieldDefn(0)->SetName(osGeometryName.c_str());
430 :
431 333 : const auto poSpatialRef = poSrcGeomFieldDefn->GetSpatialRef();
432 333 : if (poSpatialRef)
433 : {
434 42 : auto poSRS = poSpatialRef->Clone();
435 42 : m_poFeatureDefn->GetGeomFieldDefn(0)->SetSpatialRef(poSRS);
436 42 : poSRS->Release();
437 : }
438 : }
439 :
440 362 : m_osFIDColumn = CSLFetchNameValueDef(papszOptions, "FID", "");
441 :
442 : const char *pszCompression =
443 362 : CSLFetchNameValue(papszOptions, GDALMD_COMPRESSION);
444 362 : if (pszCompression == nullptr)
445 : {
446 1068 : auto oResult = arrow::util::Codec::GetCompressionType("snappy");
447 356 : if (oResult.ok() && arrow::util::Codec::IsAvailable(*oResult))
448 : {
449 356 : pszCompression = "SNAPPY";
450 : }
451 : else
452 : {
453 0 : pszCompression = "NONE";
454 : }
455 : }
456 :
457 362 : if (EQUAL(pszCompression, "NONE"))
458 0 : pszCompression = "UNCOMPRESSED";
459 : auto oResult = arrow::util::Codec::GetCompressionType(
460 724 : CPLString(pszCompression).tolower());
461 362 : if (!oResult.ok())
462 : {
463 1 : CPLError(CE_Failure, CPLE_NotSupported,
464 : "Unrecognized compression method: %s", pszCompression);
465 1 : return false;
466 : }
467 361 : m_eCompression = *oResult;
468 361 : if (!arrow::util::Codec::IsAvailable(m_eCompression))
469 : {
470 0 : CPLError(CE_Failure, CPLE_NotSupported,
471 : "Compression method %s is known, but libarrow has not "
472 : "been built with support for it",
473 : pszCompression);
474 0 : return false;
475 : }
476 361 : m_oWriterPropertiesBuilder.compression(m_eCompression);
477 :
478 : const char *pszCompressionLevel =
479 361 : CSLFetchNameValue(papszOptions, "COMPRESSION_LEVEL");
480 361 : if (pszCompressionLevel)
481 : {
482 2 : const int nCompressionLevel = atoi(pszCompressionLevel);
483 2 : if (nCompressionLevel != DEFAULT_COMPRESSION_LEVEL)
484 2 : m_oWriterPropertiesBuilder.compression_level(nCompressionLevel);
485 : }
486 359 : else if (EQUAL(pszCompression, "ZSTD"))
487 1 : m_oWriterPropertiesBuilder.compression_level(
488 : OGR_PARQUET_ZSTD_DEFAULT_COMPRESSION_LEVEL);
489 :
490 : const std::string osCreator =
491 361 : CSLFetchNameValueDef(papszOptions, "CREATOR", "");
492 361 : if (!osCreator.empty())
493 1 : m_oWriterPropertiesBuilder.created_by(osCreator);
494 : else
495 360 : m_oWriterPropertiesBuilder.created_by("GDAL " GDAL_RELEASE_NAME
496 : ", using " CREATED_BY_VERSION);
497 :
498 : // Undocumented option. Not clear it is useful besides unit test purposes
499 361 : if (!CPLTestBool(CSLFetchNameValueDef(papszOptions, "STATISTICS", "YES")))
500 1 : m_oWriterPropertiesBuilder.disable_statistics();
501 :
502 : #if PARQUET_VERSION_MAJOR >= 12
503 : // Undocumented option. Not clear it is useful to disable it.
504 361 : if (CPLTestBool(CSLFetchNameValueDef(papszOptions, "PAGE_INDEX", "YES")))
505 361 : m_oWriterPropertiesBuilder.enable_write_page_index();
506 : #endif
507 :
508 : const char *pszWriteGeo =
509 361 : CPLGetConfigOption("OGR_PARQUET_WRITE_GEO", nullptr);
510 361 : m_bWriteGeoMetadata = pszWriteGeo == nullptr || CPLTestBool(pszWriteGeo);
511 :
512 361 : if (m_eGeomEncoding == OGRArrowGeomEncoding::WKB && eGType != wkbNone)
513 : {
514 : #if ARROW_VERSION_MAJOR >= 21
515 : const char *pszUseParquetGeoTypes =
516 : CSLFetchNameValueDef(papszOptions, "USE_PARQUET_GEO_TYPES", "NO");
517 : if (EQUAL(pszUseParquetGeoTypes, "ONLY"))
518 : {
519 : m_bUseArrowWKBExtension = true;
520 : if (pszWriteGeo == nullptr)
521 : m_bWriteGeoMetadata = false;
522 : if (pszWriteCoveringBBox == nullptr)
523 : m_bWriteBBoxStruct = false;
524 : }
525 : else
526 : {
527 : m_bUseArrowWKBExtension = CPLTestBool(pszUseParquetGeoTypes);
528 : }
529 : #else
530 136 : m_oWriterPropertiesBuilder.disable_statistics(
531 408 : parquet::schema::ColumnPath::FromDotString(
532 136 : m_poFeatureDefn->GetGeomFieldDefn(0)->GetNameRef()));
533 : #endif
534 : }
535 :
536 : const char *pszRowGroupSize =
537 361 : CSLFetchNameValue(papszOptions, "ROW_GROUP_SIZE");
538 361 : if (pszRowGroupSize)
539 : {
540 19 : auto nRowGroupSize = static_cast<int64_t>(atoll(pszRowGroupSize));
541 19 : if (nRowGroupSize > 0)
542 : {
543 19 : if (nRowGroupSize > INT_MAX)
544 0 : nRowGroupSize = INT_MAX;
545 19 : m_nRowGroupSize = nRowGroupSize;
546 : }
547 : }
548 :
549 361 : m_bEdgesSpherical = EQUAL(
550 : CSLFetchNameValueDef(papszOptions, "EDGES", "PLANAR"), "SPHERICAL");
551 :
552 361 : m_bInitializationOK = true;
553 361 : return true;
554 : }
555 :
556 : /************************************************************************/
557 : /* CloseFileWriter() */
558 : /************************************************************************/
559 :
560 361 : bool OGRParquetWriterLayer::CloseFileWriter()
561 : {
562 722 : auto status = m_poFileWriter->Close();
563 361 : if (!status.ok())
564 : {
565 0 : CPLError(CE_Failure, CPLE_AppDefined,
566 : "FileWriter::Close() failed with %s",
567 0 : status.message().c_str());
568 : }
569 722 : return status.ok();
570 : }
571 :
572 : /************************************************************************/
573 : /* GetGeoMetadata() */
574 : /************************************************************************/
575 :
576 361 : std::string OGRParquetWriterLayer::GetGeoMetadata() const
577 : {
578 : // Just for unit testing purposes
579 : const char *pszGeoMetadata =
580 361 : CPLGetConfigOption("OGR_PARQUET_GEO_METADATA", nullptr);
581 361 : if (pszGeoMetadata)
582 16 : return pszGeoMetadata;
583 :
584 345 : if (m_poFeatureDefn->GetGeomFieldCount() != 0 && m_bWriteGeoMetadata)
585 : {
586 650 : CPLJSONObject oRoot;
587 325 : oRoot.Add("version", "1.1.0");
588 325 : oRoot.Add("primary_column",
589 325 : m_poFeatureDefn->GetGeomFieldDefn(0)->GetNameRef());
590 650 : CPLJSONObject oColumns;
591 325 : oRoot.Add("columns", oColumns);
592 667 : for (int i = 0; i < m_poFeatureDefn->GetGeomFieldCount(); ++i)
593 : {
594 342 : const auto poGeomFieldDefn = m_poFeatureDefn->GetGeomFieldDefn(i);
595 684 : CPLJSONObject oColumn;
596 342 : oColumns.Add(poGeomFieldDefn->GetNameRef(), oColumn);
597 342 : oColumn.Add("encoding",
598 342 : GetGeomEncodingAsString(m_aeGeomEncoding[i], true));
599 :
600 342 : if (CPLTestBool(CPLGetConfigOption("OGR_PARQUET_WRITE_CRS", "YES")))
601 : {
602 341 : const auto poSRS = poGeomFieldDefn->GetSpatialRef();
603 341 : if (poSRS)
604 : {
605 82 : OGRSpatialReference oSRSIdentified(IdentifyCRS(poSRS));
606 :
607 41 : const char *pszAuthName = oSRSIdentified.GetAuthorityName();
608 41 : const char *pszAuthCode = oSRSIdentified.GetAuthorityCode();
609 :
610 41 : bool bOmitCRS = false;
611 41 : if (pszAuthName != nullptr && pszAuthCode != nullptr &&
612 40 : ((EQUAL(pszAuthName, "EPSG") &&
613 37 : EQUAL(pszAuthCode, "4326")) ||
614 22 : (EQUAL(pszAuthName, "OGC") &&
615 3 : EQUAL(pszAuthCode, "CRS84"))))
616 : {
617 : // To make things less confusing for non-geo-aware
618 : // consumers, omit EPSG:4326 / OGC:CRS84 CRS by default
619 21 : bOmitCRS = CPLTestBool(CPLGetConfigOption(
620 : "OGR_PARQUET_CRS_OMIT_IF_WGS84", "YES"));
621 : }
622 :
623 41 : if (bOmitCRS)
624 : {
625 : // do nothing
626 : }
627 20 : else if (EQUAL(CPLGetConfigOption(
628 : "OGR_PARQUET_CRS_ENCODING", "PROJJSON"),
629 : "PROJJSON"))
630 : {
631 : // CRS encoded as PROJJSON for GeoParquet >= 0.4.0
632 20 : char *pszPROJJSON = nullptr;
633 20 : oSRSIdentified.exportToPROJJSON(&pszPROJJSON, nullptr);
634 40 : CPLJSONDocument oCRSDoc;
635 20 : CPL_IGNORE_RET_VAL(oCRSDoc.LoadMemory(pszPROJJSON));
636 20 : CPLFree(pszPROJJSON);
637 20 : CPLJSONObject oCRSRoot = oCRSDoc.GetRoot();
638 20 : RemoveIDFromMemberOfEnsembles(oCRSRoot);
639 20 : oColumn.Add("crs", oCRSRoot);
640 : }
641 : else
642 : {
643 : // WKT was used in GeoParquet <= 0.3.0
644 0 : const char *const apszOptions[] = {
645 : "FORMAT=WKT2_2019", "MULTILINE=NO", nullptr};
646 0 : char *pszWKT = nullptr;
647 0 : oSRSIdentified.exportToWkt(&pszWKT, apszOptions);
648 0 : if (pszWKT)
649 0 : oColumn.Add("crs", pszWKT);
650 0 : CPLFree(pszWKT);
651 : }
652 :
653 41 : const double dfCoordEpoch = poSRS->GetCoordinateEpoch();
654 41 : if (dfCoordEpoch > 0)
655 2 : oColumn.Add("epoch", dfCoordEpoch);
656 : }
657 : else
658 : {
659 300 : oColumn.AddNull("crs");
660 : }
661 : }
662 :
663 342 : if (m_bEdgesSpherical)
664 : {
665 3 : oColumn.Add("edges", "spherical");
666 : }
667 :
668 643 : if (m_aoEnvelopes[i].IsInit() &&
669 301 : CPLTestBool(
670 : CPLGetConfigOption("OGR_PARQUET_WRITE_BBOX", "YES")))
671 : {
672 301 : bool bHasZ = false;
673 537 : for (const auto eGeomType : m_oSetWrittenGeometryTypes[i])
674 : {
675 343 : bHasZ = CPL_TO_BOOL(OGR_GT_HasZ(eGeomType));
676 343 : if (bHasZ)
677 107 : break;
678 : }
679 301 : CPLJSONArray oBBOX;
680 301 : oBBOX.Add(m_aoEnvelopes[i].MinX);
681 301 : oBBOX.Add(m_aoEnvelopes[i].MinY);
682 301 : if (bHasZ)
683 107 : oBBOX.Add(m_aoEnvelopes[i].MinZ);
684 301 : oBBOX.Add(m_aoEnvelopes[i].MaxX);
685 301 : oBBOX.Add(m_aoEnvelopes[i].MaxY);
686 301 : if (bHasZ)
687 107 : oBBOX.Add(m_aoEnvelopes[i].MaxZ);
688 301 : oColumn.Add("bbox", oBBOX);
689 : }
690 :
691 : // Bounding box column definition
692 614 : if (m_bWriteBBoxStruct &&
693 272 : CPLTestBool(CPLGetConfigOption(
694 : "OGR_PARQUET_WRITE_COVERING_BBOX_IN_METADATA", "YES")))
695 : {
696 544 : CPLJSONObject oCovering;
697 272 : oColumn.Add("covering", oCovering);
698 544 : CPLJSONObject oBBOX;
699 272 : oCovering.Add("bbox", oBBOX);
700 : const auto AddComponent =
701 3264 : [this, i, &oBBOX](const char *pszComponent)
702 : {
703 1088 : CPLJSONArray oArray;
704 1088 : oArray.Add(m_apoFieldsBBOX[i]->name());
705 1088 : oArray.Add(pszComponent);
706 1088 : oBBOX.Add(pszComponent, oArray);
707 1088 : };
708 272 : AddComponent("xmin");
709 272 : AddComponent("ymin");
710 272 : AddComponent("xmax");
711 272 : AddComponent("ymax");
712 : }
713 :
714 359 : const auto GetStringGeometryType = [](OGRwkbGeometryType eType)
715 : {
716 359 : const auto eFlattenType = wkbFlatten(eType);
717 359 : std::string osType = "Unknown";
718 359 : if (wkbPoint == eFlattenType)
719 85 : osType = "Point";
720 274 : else if (wkbLineString == eFlattenType)
721 42 : osType = "LineString";
722 232 : else if (wkbPolygon == eFlattenType)
723 70 : osType = "Polygon";
724 162 : else if (wkbMultiPoint == eFlattenType)
725 34 : osType = "MultiPoint";
726 128 : else if (wkbMultiLineString == eFlattenType)
727 37 : osType = "MultiLineString";
728 91 : else if (wkbMultiPolygon == eFlattenType)
729 86 : osType = "MultiPolygon";
730 5 : else if (wkbGeometryCollection == eFlattenType)
731 5 : osType = "GeometryCollection";
732 359 : if (osType != "Unknown")
733 : {
734 : // M and ZM not supported officially currently, but it
735 : // doesn't hurt to anticipate
736 359 : if (OGR_GT_HasZ(eType) && OGR_GT_HasM(eType))
737 8 : osType += " ZM";
738 351 : else if (OGR_GT_HasZ(eType))
739 115 : osType += " Z";
740 236 : else if (OGR_GT_HasM(eType))
741 8 : osType += " M";
742 : }
743 359 : return osType;
744 : };
745 :
746 342 : if (m_bForceCounterClockwiseOrientation)
747 341 : oColumn.Add("orientation", "counterclockwise");
748 :
749 342 : CPLJSONArray oArray;
750 701 : for (const auto eType : m_oSetWrittenGeometryTypes[i])
751 : {
752 359 : oArray.Add(GetStringGeometryType(eType));
753 : }
754 342 : oColumn.Add("geometry_types", oArray);
755 : }
756 :
757 325 : return oRoot.Format(CPLJSONObject::PrettyFormat::Plain);
758 : }
759 20 : return std::string();
760 : }
761 :
762 : /************************************************************************/
763 : /* PerformStepsBeforeFinalFlushGroup() */
764 : /************************************************************************/
765 :
766 361 : void OGRParquetWriterLayer::PerformStepsBeforeFinalFlushGroup()
767 : {
768 361 : if (m_poKeyValueMetadata)
769 : {
770 722 : std::string osGeoMetadata = GetGeoMetadata();
771 722 : auto poTmpSchema = m_poSchema;
772 361 : if (!osGeoMetadata.empty())
773 : {
774 : // HACK: it would be good for Arrow to provide a clean way to alter
775 : // key value metadata before finalizing.
776 : // We need to write metadata at end to write the bounding box.
777 341 : const_cast<arrow::KeyValueMetadata *>(m_poKeyValueMetadata.get())
778 341 : ->Append("geo", osGeoMetadata);
779 :
780 341 : auto kvMetadata = poTmpSchema->metadata()
781 15 : ? poTmpSchema->metadata()->Copy()
782 356 : : std::make_shared<arrow::KeyValueMetadata>();
783 341 : kvMetadata->Append("geo", std::move(osGeoMetadata));
784 341 : poTmpSchema = poTmpSchema->WithMetadata(kvMetadata);
785 : }
786 :
787 361 : if (CPLTestBool(
788 : CPLGetConfigOption("OGR_PARQUET_WRITE_ARROW_SCHEMA", "YES")))
789 : {
790 : auto status =
791 722 : ::arrow::ipc::SerializeSchema(*poTmpSchema, m_poMemoryPool);
792 361 : if (status.ok())
793 : {
794 : // The serialized schema is not UTF-8, which is required for
795 : // Thrift
796 722 : const std::string schema_as_string = (*status)->ToString();
797 : std::string schema_base64 =
798 361 : ::arrow::util::base64_encode(schema_as_string);
799 361 : static const std::string kArrowSchemaKey = "ARROW:schema";
800 : const_cast<arrow::KeyValueMetadata *>(
801 361 : m_poKeyValueMetadata.get())
802 361 : ->Append(kArrowSchemaKey, std::move(schema_base64));
803 : }
804 : }
805 :
806 : // Put GDAL metadata into a gdal:metadata domain
807 722 : CPLJSONObject oMultiMetadata;
808 361 : bool bHasMultiMetadata = false;
809 369 : auto &l_oMDMD = oMDMD.GetDomainList() && *(oMDMD.GetDomainList())
810 369 : ? oMDMD
811 353 : : m_poDataset->GetMultiDomainMetadata();
812 371 : for (CSLConstList papszDomainIter = l_oMDMD.GetDomainList();
813 371 : papszDomainIter && *papszDomainIter; ++papszDomainIter)
814 : {
815 10 : const char *pszDomain = *papszDomainIter;
816 10 : CSLConstList papszMD = l_oMDMD.GetMetadata(pszDomain);
817 10 : if (STARTS_WITH(pszDomain, "json:") && papszMD && papszMD[0])
818 : {
819 1 : CPLJSONDocument oDoc;
820 1 : if (oDoc.LoadMemory(papszMD[0]))
821 : {
822 1 : bHasMultiMetadata = true;
823 1 : oMultiMetadata.Add(pszDomain, oDoc.GetRoot());
824 1 : continue;
825 0 : }
826 : }
827 9 : else if (STARTS_WITH(pszDomain, "xml:") && papszMD && papszMD[0])
828 : {
829 1 : bHasMultiMetadata = true;
830 1 : oMultiMetadata.Add(pszDomain, papszMD[0]);
831 1 : continue;
832 : }
833 16 : CPLJSONObject oMetadata;
834 8 : bool bHasMetadata = false;
835 16 : for (CSLConstList papszMDIter = papszMD;
836 16 : papszMDIter && *papszMDIter; ++papszMDIter)
837 : {
838 8 : char *pszKey = nullptr;
839 8 : const char *pszValue = CPLParseNameValue(*papszMDIter, &pszKey);
840 8 : if (pszKey && pszValue)
841 : {
842 8 : bHasMetadata = true;
843 8 : bHasMultiMetadata = true;
844 8 : oMetadata.Add(pszKey, pszValue);
845 : }
846 8 : CPLFree(pszKey);
847 : }
848 8 : if (bHasMetadata)
849 8 : oMultiMetadata.Add(pszDomain, oMetadata);
850 : }
851 361 : if (bHasMultiMetadata)
852 : {
853 8 : const_cast<arrow::KeyValueMetadata *>(m_poKeyValueMetadata.get())
854 8 : ->Append(
855 : "gdal:metadata",
856 16 : oMultiMetadata.Format(CPLJSONObject::PrettyFormat::Plain));
857 : }
858 :
859 361 : if (!m_aosCreationOptions.empty())
860 : {
861 508 : CPLJSONObject oCreationOptions;
862 254 : bool bEmpty = true;
863 1120 : for (const auto &[key, value] :
864 1374 : cpl::IterateNameValue(m_aosCreationOptions))
865 : {
866 560 : if (!EQUAL(key, "FID") && !EQUAL(key, "GEOMETRY_NAME") &&
867 521 : !EQUAL(key, "EDGES"))
868 : {
869 517 : bEmpty = false;
870 517 : oCreationOptions.Add(key, value);
871 : }
872 : }
873 254 : if (!bEmpty)
874 : {
875 : const_cast<arrow::KeyValueMetadata *>(
876 240 : m_poKeyValueMetadata.get())
877 240 : ->Append("gdal:creation-options",
878 480 : oCreationOptions.Format(
879 : CPLJSONObject::PrettyFormat::Plain));
880 : }
881 : }
882 : }
883 361 : }
884 :
885 : /************************************************************************/
886 : /* Open() */
887 : /************************************************************************/
888 :
889 : // Same as parquet::arrow::FileWriter::Open(), except we also
890 : // return KeyValueMetadata
891 : static arrow::Status
892 361 : Open(const ::arrow::Schema &schema, ::arrow::MemoryPool *pool,
893 : std::shared_ptr<::arrow::io::OutputStream> sink,
894 : std::shared_ptr<parquet::WriterProperties> properties,
895 : std::shared_ptr<parquet::ArrowWriterProperties> arrow_properties,
896 : std::unique_ptr<parquet::arrow::FileWriter> *writer,
897 : std::shared_ptr<const arrow::KeyValueMetadata> *outMetadata)
898 : {
899 361 : std::shared_ptr<parquet::SchemaDescriptor> parquet_schema;
900 722 : RETURN_NOT_OK(parquet::arrow::ToParquetSchema(
901 : &schema, *properties, *arrow_properties, &parquet_schema));
902 :
903 : auto schema_node = std::static_pointer_cast<parquet::schema::GroupNode>(
904 722 : parquet_schema->schema_root());
905 :
906 361 : auto metadata = schema.metadata()
907 20 : ? schema.metadata()->Copy()
908 742 : : std::make_shared<arrow::KeyValueMetadata>();
909 361 : *outMetadata = metadata;
910 :
911 361 : std::unique_ptr<parquet::ParquetFileWriter> base_writer;
912 361 : PARQUET_CATCH_NOT_OK(base_writer = parquet::ParquetFileWriter::Open(
913 : std::move(sink), std::move(schema_node),
914 : std::move(properties), metadata));
915 :
916 361 : auto schema_ptr = std::make_shared<::arrow::Schema>(schema);
917 : return parquet::arrow::FileWriter::Make(
918 722 : pool, std::move(base_writer), std::move(schema_ptr),
919 1083 : std::move(arrow_properties), writer);
920 : }
921 :
922 : /************************************************************************/
923 : /* CreateSchema() */
924 : /************************************************************************/
925 :
926 361 : void OGRParquetWriterLayer::CreateSchema()
927 : {
928 361 : CreateSchemaCommon();
929 361 : }
930 :
931 : /************************************************************************/
932 : /* CreateGeomField() */
933 : /************************************************************************/
934 :
935 27 : OGRErr OGRParquetWriterLayer::CreateGeomField(const OGRGeomFieldDefn *poField,
936 : int bApproxOK)
937 : {
938 27 : OGRErr eErr = OGRArrowWriterLayer::CreateGeomField(poField, bApproxOK);
939 53 : if (eErr == OGRERR_NONE &&
940 26 : m_aeGeomEncoding.back() == OGRArrowGeomEncoding::WKB
941 : #if ARROW_VERSION_MAJOR < 21
942 : // Geostatistics in Arrow 21 do not support geographic type for now
943 53 : && m_bEdgesSpherical
944 : #endif
945 : )
946 : {
947 0 : m_oWriterPropertiesBuilder.disable_statistics(
948 0 : parquet::schema::ColumnPath::FromDotString(
949 0 : m_poFeatureDefn
950 0 : ->GetGeomFieldDefn(m_poFeatureDefn->GetGeomFieldCount() - 1)
951 : ->GetNameRef()));
952 : }
953 27 : return eErr;
954 : }
955 :
956 : /************************************************************************/
957 : /* CreateWriter() */
958 : /************************************************************************/
959 :
960 361 : void OGRParquetWriterLayer::CreateWriter()
961 : {
962 361 : CPLAssert(m_poFileWriter == nullptr);
963 :
964 361 : if (m_poSchema == nullptr)
965 : {
966 43 : CreateSchema();
967 : }
968 : else
969 : {
970 318 : FinalizeSchema();
971 : }
972 :
973 : auto arrowWriterProperties =
974 361 : parquet::ArrowWriterProperties::Builder().store_schema()->build();
975 1083 : CPL_IGNORE_RET_VAL(Open(*m_poSchema, m_poMemoryPool, m_poOutputStream,
976 722 : m_oWriterPropertiesBuilder.build(),
977 361 : std::move(arrowWriterProperties), &m_poFileWriter,
978 : &m_poKeyValueMetadata));
979 361 : }
980 :
981 : /************************************************************************/
982 : /* ICreateFeature() */
983 : /************************************************************************/
984 :
985 3390 : OGRErr OGRParquetWriterLayer::ICreateFeature(OGRFeature *poFeature)
986 : {
987 : // If not using SORT_BY_BBOX=YES layer creation option, we can directly
988 : // write features to the final Parquet file
989 3390 : if (!m_poTmpGPKGLayer)
990 1186 : return OGRArrowWriterLayer::ICreateFeature(poFeature);
991 :
992 : // SORT_BY_BBOX=YES case: we write for now a serialized version of poFeature
993 : // in a temporary GeoPackage file.
994 :
995 2204 : GIntBig nFID = poFeature->GetFID();
996 2204 : if (!m_osFIDColumn.empty() && nFID == OGRNullFID)
997 : {
998 1102 : nFID = m_nTmpFeatureCount;
999 1102 : poFeature->SetFID(nFID);
1000 : }
1001 2204 : ++m_nTmpFeatureCount;
1002 :
1003 4408 : std::vector<GByte> abyBuffer;
1004 : // Serialize the source feature as a single array of bytes to preserve it
1005 : // fully
1006 2204 : if (!poFeature->SerializeToBinary(abyBuffer))
1007 : {
1008 0 : return OGRERR_FAILURE;
1009 : }
1010 :
1011 : // SQLite3 limitation: a row must fit in slightly less than 1 GB.
1012 2204 : constexpr int SOME_MARGIN = 128;
1013 2204 : if (abyBuffer.size() > 1024 * 1024 * 1024 - SOME_MARGIN)
1014 : {
1015 0 : CPLError(CE_Failure, CPLE_NotSupported,
1016 : "Features larger than 1 GB are not supported");
1017 0 : return OGRERR_FAILURE;
1018 : }
1019 :
1020 4408 : OGRFeature oFeat(m_poTmpGPKGLayer->GetLayerDefn());
1021 2204 : oFeat.SetFID(nFID);
1022 2204 : oFeat.SetField(0, static_cast<int>(abyBuffer.size()), abyBuffer.data());
1023 2204 : const auto poSrcGeom = poFeature->GetGeometryRef();
1024 2204 : if (poSrcGeom && !poSrcGeom->IsEmpty())
1025 : {
1026 : // For the purpose of building an RTree, just use the bounding box of
1027 : // the geometry as the geometry.
1028 1202 : OGREnvelope sEnvelope;
1029 1202 : poSrcGeom->getEnvelope(&sEnvelope);
1030 2404 : auto poPoly = std::make_unique<OGRPolygon>();
1031 2404 : auto poLR = std::make_unique<OGRLinearRing>();
1032 1202 : poLR->addPoint(sEnvelope.MinX, sEnvelope.MinY);
1033 1202 : poLR->addPoint(sEnvelope.MinX, sEnvelope.MaxY);
1034 1202 : poLR->addPoint(sEnvelope.MaxX, sEnvelope.MaxY);
1035 1202 : poLR->addPoint(sEnvelope.MaxX, sEnvelope.MinY);
1036 1202 : poLR->addPoint(sEnvelope.MinX, sEnvelope.MinY);
1037 1202 : poPoly->addRingDirectly(poLR.release());
1038 1202 : oFeat.SetGeometryDirectly(poPoly.release());
1039 : }
1040 2204 : return m_poTmpGPKGLayer->CreateFeature(&oFeat);
1041 : }
1042 :
1043 : /************************************************************************/
1044 : /* FlushGroup() */
1045 : /************************************************************************/
1046 :
1047 333 : bool OGRParquetWriterLayer::FlushGroup()
1048 : {
1049 : #if PARQUET_VERSION_MAJOR >= 20
1050 : auto status = m_poFileWriter->NewRowGroup();
1051 : #else
1052 666 : auto status = m_poFileWriter->NewRowGroup(m_apoBuilders[0]->length());
1053 : #endif
1054 333 : if (!status.ok())
1055 : {
1056 0 : CPLError(CE_Failure, CPLE_AppDefined, "NewRowGroup() failed with %s",
1057 0 : status.message().c_str());
1058 0 : ClearArrayBuilers();
1059 0 : return false;
1060 : }
1061 :
1062 333 : auto ret = WriteArrays(
1063 1291 : [this](const std::shared_ptr<arrow::Field> &field,
1064 1291 : const std::shared_ptr<arrow::Array> &array)
1065 : {
1066 2582 : auto l_status = m_poFileWriter->WriteColumnChunk(*array);
1067 1291 : if (!l_status.ok())
1068 : {
1069 0 : CPLError(CE_Failure, CPLE_AppDefined,
1070 : "WriteColumnChunk() failed for field %s: %s",
1071 0 : field->name().c_str(), l_status.message().c_str());
1072 0 : return false;
1073 : }
1074 1291 : return true;
1075 : });
1076 :
1077 333 : ClearArrayBuilers();
1078 333 : return ret;
1079 : }
1080 :
1081 : /************************************************************************/
1082 : /* FixupWKBGeometryBeforeWriting() */
1083 : /************************************************************************/
1084 :
1085 51 : void OGRParquetWriterLayer::FixupWKBGeometryBeforeWriting(GByte *pabyWkb,
1086 : size_t nLen)
1087 : {
1088 51 : if (!m_bForceCounterClockwiseOrientation)
1089 0 : return;
1090 :
1091 51 : OGRWKBFixupCounterClockWiseExternalRing(pabyWkb, nLen);
1092 : }
1093 :
1094 : /************************************************************************/
1095 : /* FixupGeometryBeforeWriting() */
1096 : /************************************************************************/
1097 :
1098 1429 : void OGRParquetWriterLayer::FixupGeometryBeforeWriting(OGRGeometry *poGeom)
1099 : {
1100 1429 : if (!m_bForceCounterClockwiseOrientation)
1101 3 : return;
1102 :
1103 1426 : const auto eFlattenType = wkbFlatten(poGeom->getGeometryType());
1104 : // Polygon rings MUST follow the right-hand rule for orientation
1105 : // (counterclockwise external rings, clockwise internal rings)
1106 1426 : if (eFlattenType == wkbPolygon)
1107 : {
1108 74 : bool bFirstRing = true;
1109 151 : for (auto poRing : poGeom->toPolygon())
1110 : {
1111 85 : if ((bFirstRing && poRing->isClockwise()) ||
1112 8 : (!bFirstRing && !poRing->isClockwise()))
1113 : {
1114 72 : poRing->reversePoints();
1115 : }
1116 77 : bFirstRing = false;
1117 : }
1118 : }
1119 1352 : else if (eFlattenType == wkbMultiPolygon ||
1120 : eFlattenType == wkbGeometryCollection)
1121 : {
1122 135 : for (auto poSubGeom : poGeom->toGeometryCollection())
1123 : {
1124 71 : FixupGeometryBeforeWriting(poSubGeom);
1125 : }
1126 : }
1127 : }
1128 :
1129 : /************************************************************************/
1130 : /* WriteArrowBatch() */
1131 : /************************************************************************/
1132 :
1133 : #if PARQUET_VERSION_MAJOR > 10
1134 : inline bool
1135 25 : OGRParquetWriterLayer::WriteArrowBatch(const struct ArrowSchema *schema,
1136 : struct ArrowArray *array,
1137 : CSLConstList papszOptions)
1138 : {
1139 25 : if (m_poTmpGPKGLayer)
1140 : {
1141 : // When using SORT_BY_BBOX=YES option, we can't directly write the
1142 : // input array, because we need to sort features. Hence we fallback
1143 : // to the OGRLayer base implementation, which will ultimately call
1144 : // OGRParquetWriterLayer::ICreateFeature()
1145 0 : return OGRLayer::WriteArrowBatch(schema, array, papszOptions);
1146 : }
1147 :
1148 50 : return WriteArrowBatchInternal(
1149 : schema, array, papszOptions,
1150 50 : [this](const std::shared_ptr<arrow::RecordBatch> &poBatch)
1151 : {
1152 50 : auto status = m_poFileWriter->NewBufferedRowGroup();
1153 25 : if (!status.ok())
1154 : {
1155 0 : CPLError(CE_Failure, CPLE_AppDefined,
1156 : "NewBufferedRowGroup() failed with %s",
1157 0 : status.message().c_str());
1158 0 : return false;
1159 : }
1160 :
1161 25 : status = m_poFileWriter->WriteRecordBatch(*poBatch);
1162 25 : if (!status.ok())
1163 : {
1164 0 : CPLError(CE_Failure, CPLE_AppDefined,
1165 : "WriteRecordBatch() failed: %s",
1166 0 : status.message().c_str());
1167 0 : return false;
1168 : }
1169 :
1170 25 : return true;
1171 25 : });
1172 : }
1173 : #endif
1174 :
1175 : /************************************************************************/
1176 : /* TestCapability() */
1177 : /************************************************************************/
1178 :
1179 694 : inline int OGRParquetWriterLayer::TestCapability(const char *pszCap) const
1180 : {
1181 : #if PARQUET_VERSION_MAJOR <= 10
1182 : if (EQUAL(pszCap, OLCFastWriteArrowBatch))
1183 : return false;
1184 : #endif
1185 :
1186 694 : if (m_poTmpGPKGLayer && EQUAL(pszCap, OLCFastWriteArrowBatch))
1187 : {
1188 : // When using SORT_BY_BBOX=YES option, we can't directly write the
1189 : // input array, because we need to sort features. So this is not
1190 : // fast
1191 1 : return false;
1192 : }
1193 :
1194 693 : return OGRArrowWriterLayer::TestCapability(pszCap);
1195 : }
1196 :
1197 : /************************************************************************/
1198 : /* CreateFieldFromArrowSchema() */
1199 : /************************************************************************/
1200 :
1201 : #if PARQUET_VERSION_MAJOR > 10
1202 479 : bool OGRParquetWriterLayer::CreateFieldFromArrowSchema(
1203 : const struct ArrowSchema *schema, CSLConstList papszOptions)
1204 : {
1205 479 : if (m_poTmpGPKGLayer)
1206 : {
1207 : // When using SORT_BY_BBOX=YES option, we can't directly write the
1208 : // input array, because we need to sort features. But this process
1209 : // only supports the base Arrow types supported by
1210 : // OGRLayer::WriteArrowBatch()
1211 0 : return OGRLayer::CreateFieldFromArrowSchema(schema, papszOptions);
1212 : }
1213 :
1214 479 : return OGRArrowWriterLayer::CreateFieldFromArrowSchema(schema,
1215 479 : papszOptions);
1216 : }
1217 : #endif
1218 :
1219 : /************************************************************************/
1220 : /* IsArrowSchemaSupported() */
1221 : /************************************************************************/
1222 :
1223 : #if PARQUET_VERSION_MAJOR > 10
1224 1095 : bool OGRParquetWriterLayer::IsArrowSchemaSupported(
1225 : const struct ArrowSchema *schema, CSLConstList papszOptions,
1226 : std::string &osErrorMsg) const
1227 : {
1228 1095 : if (m_poTmpGPKGLayer)
1229 : {
1230 : // When using SORT_BY_BBOX=YES option, we can't directly write the
1231 : // input array, because we need to sort features. But this process
1232 : // only supports the base Arrow types supported by
1233 : // OGRLayer::WriteArrowBatch()
1234 0 : return OGRLayer::IsArrowSchemaSupported(schema, papszOptions,
1235 0 : osErrorMsg);
1236 : }
1237 :
1238 1095 : if (schema->format[0] == 'e' && schema->format[1] == 0)
1239 : {
1240 1 : osErrorMsg = "float16 not supported";
1241 1 : return false;
1242 : }
1243 1094 : if (schema->format[0] == 'v' && schema->format[1] == 'u')
1244 : {
1245 1 : osErrorMsg = "StringView not supported";
1246 1 : return false;
1247 : }
1248 1093 : if (schema->format[0] == 'v' && schema->format[1] == 'z')
1249 : {
1250 1 : osErrorMsg = "BinaryView not supported";
1251 1 : return false;
1252 : }
1253 1092 : if (schema->format[0] == '+' && schema->format[1] == 'v')
1254 : {
1255 0 : if (schema->format[2] == 'l')
1256 : {
1257 0 : osErrorMsg = "ListView not supported";
1258 0 : return false;
1259 : }
1260 0 : else if (schema->format[2] == 'L')
1261 : {
1262 0 : osErrorMsg = "LargeListView not supported";
1263 0 : return false;
1264 : }
1265 : }
1266 2169 : for (int64_t i = 0; i < schema->n_children; ++i)
1267 : {
1268 1080 : if (!IsArrowSchemaSupported(schema->children[i], papszOptions,
1269 : osErrorMsg))
1270 : {
1271 3 : return false;
1272 : }
1273 : }
1274 1089 : return true;
1275 : }
1276 : #endif
1277 :
1278 : /************************************************************************/
1279 : /* SetMetadata() */
1280 : /************************************************************************/
1281 :
1282 13 : CPLErr OGRParquetWriterLayer::SetMetadata(CSLConstList papszMetadata,
1283 : const char *pszDomain)
1284 : {
1285 13 : if (!pszDomain || !EQUAL(pszDomain, "SHAPEFILE"))
1286 : {
1287 8 : return OGRLayer::SetMetadata(papszMetadata, pszDomain);
1288 : }
1289 5 : return CE_None;
1290 : }
1291 :
1292 : /************************************************************************/
1293 : /* GetDataset() */
1294 : /************************************************************************/
1295 :
1296 26 : GDALDataset *OGRParquetWriterLayer::GetDataset()
1297 : {
1298 26 : return m_poDataset;
1299 : }
|