Line data Source code
1 : /******************************************************************************
2 : *
3 : * Project: Parquet Translator
4 : * Purpose: Implements OGRParquetDriver.
5 : * Author: Even Rouault, <even.rouault at spatialys.com>
6 : *
7 : ******************************************************************************
8 : * Copyright (c) 2022, Planet Labs
9 : *
10 : * SPDX-License-Identifier: MIT
11 : ****************************************************************************/
12 :
13 : #include "gdal_pam.h"
14 : #include "ogrsf_frmts.h"
15 :
16 : #include <algorithm>
17 : #include <map>
18 : #include <tuple>
19 :
20 : #include "ogr_parquet.h"
21 : #include "ogrparquetdrivercore.h"
22 :
23 : #include "../arrow_common/ograrrowrandomaccessfile.h"
24 : #include "../arrow_common/vsiarrowfilesystem.hpp"
25 : #include "../arrow_common/ograrrowwritablefile.h"
26 : #include "../arrow_common/ograrrowdataset.hpp"
27 : #include "../arrow_common/ograrrowlayer.hpp" // for the destructor
28 :
29 : #ifdef GDAL_USE_ARROWDATASET
30 :
31 : /************************************************************************/
32 : /* OpenFromDatasetFactory() */
33 : /************************************************************************/
34 :
35 273 : static GDALDataset *OpenFromDatasetFactory(
36 : const std::string &osBasePath,
37 : const std::shared_ptr<arrow::dataset::DatasetFactory> &factory,
38 : CSLConstList papszOpenOptions,
39 : const std::shared_ptr<arrow::fs::FileSystem> &fs)
40 : {
41 273 : std::shared_ptr<arrow::dataset::Dataset> dataset;
42 546 : PARQUET_ASSIGN_OR_THROW(dataset, factory->Finish());
43 :
44 : auto poMemoryPool = std::shared_ptr<arrow::MemoryPool>(
45 546 : arrow::MemoryPool::CreateDefault().release());
46 :
47 273 : const bool bIsVSI = STARTS_WITH(osBasePath.c_str(), "/vsi");
48 546 : auto poDS = std::make_unique<OGRParquetDataset>(poMemoryPool);
49 : auto poLayer = std::make_unique<OGRParquetDatasetLayer>(
50 546 : poDS.get(), CPLGetBasenameSafe(osBasePath.c_str()).c_str(), bIsVSI,
51 546 : dataset, papszOpenOptions);
52 273 : poDS->SetLayer(std::move(poLayer));
53 273 : poDS->SetFileSystem(fs);
54 546 : return poDS.release();
55 : }
56 :
57 : /************************************************************************/
58 : /* GetFileSystem() */
59 : /************************************************************************/
60 :
61 : static std::tuple<std::shared_ptr<arrow::fs::FileSystem>, std::string>
62 273 : GetFileSystem(std::string &osBasePathInOut,
63 : const std::string &osQueryParameters)
64 : {
65 : // Instantiate file system:
66 : // - VSIArrowFileSystem implementation for /vsi files
67 : // - base implementation for local files (if OGR_PARQUET_USE_VSI set to NO)
68 273 : std::shared_ptr<arrow::fs::FileSystem> fs;
69 273 : const bool bIsVSI = STARTS_WITH(osBasePathInOut.c_str(), "/vsi");
70 : VSIStatBufL sStat;
71 546 : std::string osFSFilename;
72 459 : if ((bIsVSI ||
73 538 : CPLTestBool(CPLGetConfigOption("OGR_PARQUET_USE_VSI", "YES"))) &&
74 265 : VSIStatL(osBasePathInOut.c_str(), &sStat) == 0)
75 : {
76 264 : osFSFilename = osBasePathInOut;
77 264 : fs = std::make_shared<VSIArrowFileSystem>("PARQUET", osQueryParameters);
78 : }
79 : else
80 : {
81 : // FileSystemFromUriOrPath() doesn't like relative paths
82 : // so transform them to absolute.
83 9 : std::string osPath(osBasePathInOut);
84 9 : if (CPLIsFilenameRelative(osPath.c_str()))
85 : {
86 8 : char *pszCurDir = CPLGetCurrentDir();
87 8 : if (pszCurDir == nullptr)
88 0 : return {nullptr, osFSFilename};
89 8 : osPath = CPLFormFilenameSafe(pszCurDir, osPath.c_str(), nullptr);
90 8 : CPLFree(pszCurDir);
91 : }
92 9 : PARQUET_ASSIGN_OR_THROW(
93 : fs, arrow::fs::FileSystemFromUriOrPath(osPath, &osFSFilename));
94 : }
95 273 : return {fs, osFSFilename};
96 : }
97 :
98 : /************************************************************************/
99 : /* OpenParquetDatasetWithMetadata() */
100 : /************************************************************************/
101 :
102 18 : static GDALDataset *OpenParquetDatasetWithMetadata(
103 : const std::string &osBasePathIn, const char *pszMetadataFile,
104 : const std::string &osQueryParameters, CSLConstList papszOpenOptions)
105 : {
106 36 : std::string osBasePath(osBasePathIn);
107 18 : const auto &[fs, osFSFilename] =
108 36 : GetFileSystem(osBasePath, osQueryParameters);
109 :
110 36 : arrow::dataset::ParquetFactoryOptions options;
111 36 : auto partitioningFactory = arrow::dataset::HivePartitioning::MakeFactory();
112 : options.partitioning =
113 18 : arrow::dataset::PartitioningOrFactory(std::move(partitioningFactory));
114 :
115 18 : std::shared_ptr<arrow::dataset::DatasetFactory> factory;
116 : // coverity[copy_constructor_call]
117 54 : PARQUET_ASSIGN_OR_THROW(
118 : factory, arrow::dataset::ParquetDatasetFactory::Make(
119 : osFSFilename + '/' + pszMetadataFile, fs,
120 : std::make_shared<arrow::dataset::ParquetFileFormat>(),
121 : std::move(options)));
122 :
123 36 : return OpenFromDatasetFactory(osBasePath, factory, papszOpenOptions, fs);
124 : }
125 :
126 : /************************************************************************/
127 : /* OpenParquetDatasetWithoutMetadata() */
128 : /************************************************************************/
129 :
130 : static GDALDataset *
131 255 : OpenParquetDatasetWithoutMetadata(const std::string &osBasePathIn,
132 : const std::string &osQueryParameters,
133 : CSLConstList papszOpenOptions)
134 : {
135 510 : std::string osBasePath(osBasePathIn);
136 255 : const auto &[fs, osFSFilename] =
137 510 : GetFileSystem(osBasePath, osQueryParameters);
138 :
139 510 : arrow::dataset::FileSystemFactoryOptions options;
140 255 : std::shared_ptr<arrow::dataset::DatasetFactory> factory;
141 :
142 510 : const auto fileInfo = fs->GetFileInfo(osFSFilename);
143 255 : if (fileInfo->IsFile())
144 : {
145 : // coverity[copy_constructor_call]
146 1008 : PARQUET_ASSIGN_OR_THROW(
147 : factory, arrow::dataset::FileSystemDatasetFactory::Make(
148 : fs, {std::move(osFSFilename)},
149 : std::make_shared<arrow::dataset::ParquetFileFormat>(),
150 : std::move(options)));
151 : }
152 : else
153 : {
154 : auto partitioningFactory =
155 6 : arrow::dataset::HivePartitioning::MakeFactory();
156 6 : options.partitioning = arrow::dataset::PartitioningOrFactory(
157 6 : std::move(partitioningFactory));
158 :
159 6 : arrow::fs::FileSelector selector;
160 3 : selector.base_dir = std::move(osFSFilename);
161 3 : selector.recursive = true;
162 :
163 : // coverity[copy_constructor_call]
164 6 : PARQUET_ASSIGN_OR_THROW(
165 : factory, arrow::dataset::FileSystemDatasetFactory::Make(
166 : fs, std::move(selector),
167 : std::make_shared<arrow::dataset::ParquetFileFormat>(),
168 : std::move(options)));
169 : }
170 :
171 510 : return OpenFromDatasetFactory(osBasePath, factory, papszOpenOptions, fs);
172 : }
173 :
174 : #endif
175 :
176 : /************************************************************************/
177 : /* BuildMemDatasetWithRowGroupExtents() */
178 : /************************************************************************/
179 :
180 : /** Builds a Memory dataset that contains, for each row-group of the input file,
181 : * the feature count and spatial extent of the features of this row group,
182 : * using Parquet statistics. This assumes that the Parquet file declares
183 : * a "covering":{"bbox":{ ... }} metadata item.
184 : *
185 : * Only for debug purposes.
186 : */
187 1 : static GDALDataset *BuildMemDatasetWithRowGroupExtents(OGRParquetLayer *poLayer)
188 : {
189 1 : int iParquetXMin = -1;
190 1 : int iParquetYMin = -1;
191 1 : int iParquetXMax = -1;
192 1 : int iParquetYMax = -1;
193 1 : if (poLayer->GeomColsBBOXParquet(0, iParquetXMin, iParquetYMin,
194 : iParquetXMax, iParquetYMax))
195 : {
196 1 : auto poMemDrv = GetGDALDriverManager()->GetDriverByName("Memory");
197 1 : if (!poMemDrv)
198 0 : return nullptr;
199 : auto poMemDS = std::unique_ptr<GDALDataset>(
200 2 : poMemDrv->Create("", 0, 0, 0, GDT_Unknown, nullptr));
201 1 : if (!poMemDS)
202 0 : return nullptr;
203 1 : OGRSpatialReference *poTmpSRS = nullptr;
204 1 : const auto poSrcSRS = poLayer->GetSpatialRef();
205 1 : if (poSrcSRS)
206 0 : poTmpSRS = poSrcSRS->Clone();
207 : auto poMemLayer =
208 1 : poMemDS->CreateLayer("footprint", poTmpSRS, wkbPolygon, nullptr);
209 1 : if (poTmpSRS)
210 0 : poTmpSRS->Release();
211 1 : if (!poMemLayer)
212 0 : return nullptr;
213 1 : poMemLayer->CreateField(
214 1 : std::make_unique<OGRFieldDefn>("feature_count", OFTInteger64)
215 1 : .get());
216 :
217 : const auto metadata =
218 2 : poLayer->GetReader()->parquet_reader()->metadata();
219 1 : const int numRowGroups = metadata->num_row_groups();
220 15 : for (int iRowGroup = 0; iRowGroup < numRowGroups; ++iRowGroup)
221 : {
222 28 : std::string osMinTmp, osMaxTmp;
223 : OGRField unusedF;
224 : bool unusedB;
225 : OGRFieldSubType unusedSubType;
226 :
227 : OGRField sXMin;
228 14 : OGR_RawField_SetNull(&sXMin);
229 14 : bool bFoundXMin = false;
230 14 : OGRFieldType eXMinType = OFTMaxType;
231 :
232 : OGRField sYMin;
233 14 : OGR_RawField_SetNull(&sYMin);
234 14 : bool bFoundYMin = false;
235 14 : OGRFieldType eYMinType = OFTMaxType;
236 :
237 : OGRField sXMax;
238 14 : OGR_RawField_SetNull(&sXMax);
239 14 : bool bFoundXMax = false;
240 14 : OGRFieldType eXMaxType = OFTMaxType;
241 :
242 : OGRField sYMax;
243 14 : OGR_RawField_SetNull(&sYMax);
244 14 : bool bFoundYMax = false;
245 14 : OGRFieldType eYMaxType = OFTMaxType;
246 :
247 14 : if (poLayer->GetMinMaxForParquetCol(
248 : iRowGroup, iParquetXMin, nullptr,
249 : /* bComputeMin = */ true, sXMin, bFoundXMin,
250 : /* bComputeMax = */ false, unusedF, unusedB, eXMinType,
251 8 : unusedSubType, osMinTmp, osMaxTmp) &&
252 8 : bFoundXMin && eXMinType == OFTReal &&
253 22 : poLayer->GetMinMaxForParquetCol(
254 : iRowGroup, iParquetYMin, nullptr,
255 : /* bComputeMin = */ true, sYMin, bFoundYMin,
256 : /* bComputeMax = */ false, unusedF, unusedB, eYMinType,
257 8 : unusedSubType, osMinTmp, osMaxTmp) &&
258 8 : bFoundYMin && eYMinType == OFTReal &&
259 22 : poLayer->GetMinMaxForParquetCol(
260 : iRowGroup, iParquetXMax, nullptr,
261 : /* bComputeMin = */ false, unusedF, unusedB,
262 : /* bComputeMax = */ true, sXMax, bFoundXMax, eXMaxType,
263 8 : unusedSubType, osMaxTmp, osMaxTmp) &&
264 8 : bFoundXMax && eXMaxType == OFTReal &&
265 22 : poLayer->GetMinMaxForParquetCol(
266 : iRowGroup, iParquetYMax, nullptr,
267 : /* bComputeMin = */ false, unusedF, unusedB,
268 : /* bComputeMax = */ true, sYMax, bFoundYMax, eYMaxType,
269 8 : unusedSubType, osMaxTmp, osMaxTmp) &&
270 22 : bFoundYMax && eYMaxType == OFTReal)
271 : {
272 16 : OGRFeature oFeat(poMemLayer->GetLayerDefn());
273 8 : oFeat.SetField(0,
274 : static_cast<GIntBig>(
275 8 : metadata->RowGroup(iRowGroup)->num_rows()));
276 16 : auto poPoly = std::make_unique<OGRPolygon>();
277 8 : auto poLR = std::make_unique<OGRLinearRing>();
278 8 : poLR->addPoint(sXMin.Real, sYMin.Real);
279 8 : poLR->addPoint(sXMin.Real, sYMax.Real);
280 8 : poLR->addPoint(sXMax.Real, sYMax.Real);
281 8 : poLR->addPoint(sXMax.Real, sYMin.Real);
282 8 : poLR->addPoint(sXMin.Real, sYMin.Real);
283 8 : poPoly->addRingDirectly(poLR.release());
284 8 : oFeat.SetGeometryDirectly(poPoly.release());
285 8 : CPL_IGNORE_RET_VAL(poMemLayer->CreateFeature(&oFeat));
286 : }
287 : }
288 :
289 1 : return poMemDS.release();
290 : }
291 0 : return nullptr;
292 : }
293 :
294 : /************************************************************************/
295 : /* Open() */
296 : /************************************************************************/
297 :
298 1603 : static GDALDataset *OGRParquetDriverOpen(GDALOpenInfo *poOpenInfo)
299 : {
300 1603 : if (poOpenInfo->eAccess == GA_Update)
301 61 : return nullptr;
302 :
303 : #ifdef GDAL_USE_ARROWDATASET
304 3084 : std::string osBasePath(poOpenInfo->pszFilename);
305 3084 : std::string osQueryParameters;
306 : const bool bStartedWithParquetPrefix =
307 1542 : STARTS_WITH(osBasePath.c_str(), "PARQUET:");
308 :
309 1542 : if (bStartedWithParquetPrefix)
310 : {
311 261 : osBasePath = osBasePath.substr(strlen("PARQUET:"));
312 : }
313 :
314 : // Little trick to allow using syntax of
315 : // https://github.com/opengeospatial/geoparquet/discussions/101
316 : // ogrinfo
317 : // "/vsicurl/https://ai4edataeuwest.blob.core.windows.net/us-census/2020/cb_2020_us_vtd_500k.parquet?${SAS_TOKEN}"
318 1542 : if (STARTS_WITH(osBasePath.c_str(), "/vsicurl/"))
319 : {
320 2 : const auto nPos = osBasePath.find(".parquet?st=");
321 2 : if (nPos != std::string::npos)
322 : {
323 0 : osQueryParameters = osBasePath.substr(nPos + strlen(".parquet"));
324 0 : osBasePath.resize(nPos + strlen(".parquet"));
325 : }
326 : }
327 :
328 2390 : if (bStartedWithParquetPrefix || poOpenInfo->bIsDirectory ||
329 848 : !osQueryParameters.empty())
330 : {
331 : VSIStatBufL sStat;
332 694 : if (!osBasePath.empty() && osBasePath.back() == '/')
333 0 : osBasePath.pop_back();
334 : const std::string osMetadataPath =
335 694 : CPLFormFilenameSafe(osBasePath.c_str(), "_metadata", nullptr);
336 694 : if (CPLTestBool(
337 2082 : CPLGetConfigOption("OGR_PARQUET_USE_METADATA_FILE", "YES")) &&
338 1388 : VSIStatL((osMetadataPath + osQueryParameters).c_str(), &sStat) == 0)
339 : {
340 : // If there's a _metadata file, then use it to avoid listing files
341 : try
342 : {
343 36 : return OpenParquetDatasetWithMetadata(
344 : osBasePath, "_metadata", osQueryParameters,
345 18 : poOpenInfo->papszOpenOptions);
346 : }
347 0 : catch (const std::exception &e)
348 : {
349 0 : CPLError(CE_Failure, CPLE_AppDefined, "Parquet exception: %s",
350 0 : e.what());
351 : }
352 0 : return nullptr;
353 : }
354 : else
355 : {
356 676 : bool bLikelyParquetDataset = false;
357 676 : if (poOpenInfo->bIsDirectory)
358 : {
359 : // Detect if the directory contains .parquet files, or
360 : // subdirectories with a name of the form "key=value", typical
361 : // of HIVE partitioning.
362 846 : const CPLStringList aosFiles(VSIReadDir(osBasePath.c_str()));
363 22266 : for (const char *pszFilename : cpl::Iterate(aosFiles))
364 : {
365 21845 : if (EQUAL(CPLGetExtensionSafe(pszFilename).c_str(),
366 : "parquet"))
367 : {
368 2 : bLikelyParquetDataset = true;
369 2 : break;
370 : }
371 21843 : else if (strchr(pszFilename, '='))
372 : {
373 : // HIVE partitioning
374 0 : if (VSIStatL(CPLFormFilenameSafe(osBasePath.c_str(),
375 : pszFilename, nullptr)
376 : .c_str(),
377 0 : &sStat) == 0 &&
378 0 : VSI_ISDIR(sStat.st_mode))
379 : {
380 0 : bLikelyParquetDataset = true;
381 0 : break;
382 : }
383 : }
384 : }
385 : }
386 :
387 676 : if (bStartedWithParquetPrefix || bLikelyParquetDataset)
388 : {
389 : try
390 : {
391 510 : return OpenParquetDatasetWithoutMetadata(
392 : osBasePath, osQueryParameters,
393 255 : poOpenInfo->papszOpenOptions);
394 : }
395 0 : catch (const std::exception &e)
396 : {
397 : // If we aren't quite sure that the passed file name is
398 : // a directory, then silently continue
399 0 : if (poOpenInfo->bIsDirectory)
400 : {
401 0 : CPLError(CE_Failure, CPLE_AppDefined,
402 0 : "Parquet exception: %s", e.what());
403 0 : return nullptr;
404 : }
405 : }
406 : }
407 : }
408 : }
409 : #endif
410 :
411 1269 : if (!OGRParquetDriverIdentify(poOpenInfo))
412 : {
413 0 : return nullptr;
414 : }
415 :
416 1269 : if (poOpenInfo->bIsDirectory)
417 421 : return nullptr;
418 :
419 1696 : std::string osFilename(poOpenInfo->pszFilename);
420 848 : if (STARTS_WITH(poOpenInfo->pszFilename, "PARQUET:"))
421 : {
422 0 : osFilename = poOpenInfo->pszFilename + strlen("PARQUET:");
423 : }
424 :
425 : try
426 : {
427 848 : std::shared_ptr<arrow::io::RandomAccessFile> infile;
428 1312 : if (STARTS_WITH(osFilename.c_str(), "/vsi") ||
429 464 : CPLTestBool(CPLGetConfigOption("OGR_PARQUET_USE_VSI", "NO")))
430 : {
431 384 : VSIVirtualHandleUniquePtr fp(poOpenInfo->fpL);
432 384 : poOpenInfo->fpL = nullptr;
433 384 : if (fp == nullptr)
434 : {
435 0 : fp.reset(VSIFOpenL(osFilename.c_str(), "rb"));
436 0 : if (fp == nullptr)
437 0 : return nullptr;
438 : }
439 768 : infile = std::make_shared<OGRArrowRandomAccessFile>(osFilename,
440 768 : std::move(fp));
441 : }
442 : else
443 : {
444 464 : PARQUET_ASSIGN_OR_THROW(infile,
445 : arrow::io::ReadableFile::Open(osFilename));
446 : }
447 :
448 : // Open Parquet file reader
449 848 : std::unique_ptr<parquet::arrow::FileReader> arrow_reader;
450 : auto poMemoryPool = std::shared_ptr<arrow::MemoryPool>(
451 1696 : arrow::MemoryPool::CreateDefault().release());
452 : #if ARROW_VERSION_MAJOR >= 19
453 2544 : PARQUET_ASSIGN_OR_THROW(
454 : arrow_reader,
455 : parquet::arrow::OpenFile(std::move(infile), poMemoryPool.get()));
456 : #else
457 : auto st = parquet::arrow::OpenFile(std::move(infile),
458 : poMemoryPool.get(), &arrow_reader);
459 : if (!st.ok())
460 : {
461 : CPLError(CE_Failure, CPLE_AppDefined,
462 : "parquet::arrow::OpenFile() failed");
463 : return nullptr;
464 : }
465 : #endif
466 :
467 1694 : auto poDS = std::make_unique<OGRParquetDataset>(poMemoryPool);
468 : auto poLayer = std::make_unique<OGRParquetLayer>(
469 1694 : poDS.get(), CPLGetBasenameSafe(osFilename.c_str()).c_str(),
470 2541 : std::move(arrow_reader), poOpenInfo->papszOpenOptions);
471 :
472 : // For debug purposes: return a layer with the extent of each row group
473 847 : if (CPLTestBool(
474 : CPLGetConfigOption("OGR_PARQUET_SHOW_ROW_GROUP_EXTENT", "NO")))
475 : {
476 1 : return BuildMemDatasetWithRowGroupExtents(poLayer.get());
477 : }
478 :
479 846 : poDS->SetLayer(std::move(poLayer));
480 846 : return poDS.release();
481 : }
482 1 : catch (const std::exception &e)
483 : {
484 1 : CPLError(CE_Failure, CPLE_AppDefined, "Parquet exception: %s",
485 1 : e.what());
486 1 : return nullptr;
487 : }
488 : }
489 :
490 : /************************************************************************/
491 : /* Create() */
492 : /************************************************************************/
493 :
494 264 : static GDALDataset *OGRParquetDriverCreate(const char *pszName, int nXSize,
495 : int nYSize, int nBands,
496 : GDALDataType eType,
497 : char ** /* papszOptions */)
498 : {
499 264 : if (!(nXSize == 0 && nYSize == 0 && nBands == 0 && eType == GDT_Unknown))
500 0 : return nullptr;
501 :
502 : try
503 : {
504 264 : std::shared_ptr<arrow::io::OutputStream> out_file;
505 342 : if (STARTS_WITH(pszName, "/vsi") ||
506 78 : CPLTestBool(CPLGetConfigOption("OGR_PARQUET_USE_VSI", "YES")))
507 : {
508 264 : VSILFILE *fp = VSIFOpenL(pszName, "wb");
509 264 : if (fp == nullptr)
510 : {
511 1 : CPLError(CE_Failure, CPLE_FileIO, "Cannot create %s", pszName);
512 1 : return nullptr;
513 : }
514 263 : out_file = std::make_shared<OGRArrowWritableFile>(fp);
515 : }
516 : else
517 : {
518 0 : PARQUET_ASSIGN_OR_THROW(out_file,
519 : arrow::io::FileOutputStream::Open(pszName));
520 : }
521 :
522 263 : return new OGRParquetWriterDataset(out_file);
523 : }
524 0 : catch (const std::exception &e)
525 : {
526 0 : CPLError(CE_Failure, CPLE_AppDefined, "Parquet exception: %s",
527 0 : e.what());
528 0 : return nullptr;
529 : }
530 : }
531 :
532 : /************************************************************************/
533 : /* OGRParquetDriver() */
534 : /************************************************************************/
535 :
536 : class OGRParquetDriver final : public GDALDriver
537 : {
538 : bool m_bMetadataInitialized = false;
539 : void InitMetadata();
540 :
541 : public:
542 1756 : const char *GetMetadataItem(const char *pszName,
543 : const char *pszDomain) override
544 : {
545 1756 : if (EQUAL(pszName, GDAL_DS_LAYER_CREATIONOPTIONLIST))
546 : {
547 296 : InitMetadata();
548 : }
549 1756 : return GDALDriver::GetMetadataItem(pszName, pszDomain);
550 : }
551 :
552 40 : char **GetMetadata(const char *pszDomain) override
553 : {
554 40 : InitMetadata();
555 40 : return GDALDriver::GetMetadata(pszDomain);
556 : }
557 : };
558 :
559 336 : void OGRParquetDriver::InitMetadata()
560 : {
561 336 : if (m_bMetadataInitialized)
562 321 : return;
563 15 : m_bMetadataInitialized = true;
564 :
565 : CPLXMLTreeCloser oTree(
566 30 : CPLCreateXMLNode(nullptr, CXT_Element, "LayerCreationOptionList"));
567 :
568 30 : std::vector<const char *> apszCompressionMethods;
569 15 : bool bHasSnappy = false;
570 105 : for (const char *pszMethod :
571 120 : {"SNAPPY", "GZIP", "BROTLI", "ZSTD", "LZ4_RAW", "LZO", "LZ4_HADOOP"})
572 : {
573 : auto oResult = arrow::util::Codec::GetCompressionType(
574 210 : CPLString(pszMethod).tolower());
575 105 : if (oResult.ok() && arrow::util::Codec::IsAvailable(*oResult))
576 : {
577 90 : if (EQUAL(pszMethod, "SNAPPY"))
578 15 : bHasSnappy = true;
579 90 : apszCompressionMethods.emplace_back(pszMethod);
580 : }
581 : }
582 :
583 : {
584 15 : auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
585 15 : CPLAddXMLAttributeAndValue(psOption, "name", "COMPRESSION");
586 15 : CPLAddXMLAttributeAndValue(psOption, "type", "string-select");
587 15 : CPLAddXMLAttributeAndValue(psOption, "description",
588 : "Compression method");
589 15 : CPLAddXMLAttributeAndValue(psOption, "default",
590 : bHasSnappy ? "SNAPPY" : "NONE");
591 : {
592 15 : auto poValueNode = CPLCreateXMLNode(psOption, CXT_Element, "Value");
593 15 : CPLAddXMLAttributeAndValue(poValueNode, "alias", "UNCOMPRESSED");
594 15 : CPLCreateXMLNode(poValueNode, CXT_Text, "NONE");
595 : }
596 105 : for (const char *pszMethod : apszCompressionMethods)
597 : {
598 90 : auto poValueNode = CPLCreateXMLNode(psOption, CXT_Element, "Value");
599 90 : CPLCreateXMLNode(poValueNode, CXT_Text, pszMethod);
600 : }
601 : }
602 :
603 : {
604 15 : auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
605 15 : CPLAddXMLAttributeAndValue(psOption, "name", "GEOMETRY_ENCODING");
606 15 : CPLAddXMLAttributeAndValue(psOption, "type", "string-select");
607 15 : CPLAddXMLAttributeAndValue(psOption, "description",
608 : "Encoding of geometry columns");
609 15 : CPLAddXMLAttributeAndValue(psOption, "default", "WKB");
610 60 : for (const char *pszEncoding :
611 75 : {"WKB", "WKT", "GEOARROW", "GEOARROW_INTERLEAVED"})
612 : {
613 60 : auto poValueNode = CPLCreateXMLNode(psOption, CXT_Element, "Value");
614 60 : CPLCreateXMLNode(poValueNode, CXT_Text, pszEncoding);
615 60 : if (EQUAL(pszEncoding, "GEOARROW"))
616 15 : CPLAddXMLAttributeAndValue(poValueNode, "alias",
617 : "GEOARROW_STRUCT");
618 : }
619 : }
620 :
621 : {
622 15 : auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
623 15 : CPLAddXMLAttributeAndValue(psOption, "name", "ROW_GROUP_SIZE");
624 15 : CPLAddXMLAttributeAndValue(psOption, "type", "integer");
625 15 : CPLAddXMLAttributeAndValue(psOption, "description",
626 : "Maximum number of rows per group");
627 15 : CPLAddXMLAttributeAndValue(psOption, "default", "65536");
628 : }
629 :
630 : {
631 15 : auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
632 15 : CPLAddXMLAttributeAndValue(psOption, "name", "GEOMETRY_NAME");
633 15 : CPLAddXMLAttributeAndValue(psOption, "type", "string");
634 15 : CPLAddXMLAttributeAndValue(psOption, "description",
635 : "Name of geometry column");
636 15 : CPLAddXMLAttributeAndValue(psOption, "default", "geometry");
637 : }
638 :
639 : {
640 15 : auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
641 15 : CPLAddXMLAttributeAndValue(psOption, "name", "COORDINATE_PRECISION");
642 15 : CPLAddXMLAttributeAndValue(psOption, "type", "float");
643 15 : CPLAddXMLAttributeAndValue(psOption, "description",
644 : "Number of decimals for coordinates (only "
645 : "for GEOMETRY_ENCODING=WKT)");
646 : }
647 :
648 : {
649 15 : auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
650 15 : CPLAddXMLAttributeAndValue(psOption, "name", "FID");
651 15 : CPLAddXMLAttributeAndValue(psOption, "type", "string");
652 15 : CPLAddXMLAttributeAndValue(psOption, "description",
653 : "Name of the FID column to create");
654 : }
655 :
656 : {
657 15 : auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
658 15 : CPLAddXMLAttributeAndValue(psOption, "name", "POLYGON_ORIENTATION");
659 15 : CPLAddXMLAttributeAndValue(psOption, "type", "string-select");
660 15 : CPLAddXMLAttributeAndValue(
661 : psOption, "description",
662 : "Which ring orientation to use for polygons");
663 15 : CPLAddXMLAttributeAndValue(psOption, "default", "COUNTERCLOCKWISE");
664 15 : CPLCreateXMLElementAndValue(psOption, "Value", "COUNTERCLOCKWISE");
665 15 : CPLCreateXMLElementAndValue(psOption, "Value", "UNMODIFIED");
666 : }
667 :
668 : {
669 15 : auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
670 15 : CPLAddXMLAttributeAndValue(psOption, "name", "EDGES");
671 15 : CPLAddXMLAttributeAndValue(psOption, "type", "string-select");
672 15 : CPLAddXMLAttributeAndValue(
673 : psOption, "description",
674 : "Name of the coordinate system for the edges");
675 15 : CPLAddXMLAttributeAndValue(psOption, "default", "PLANAR");
676 15 : CPLCreateXMLElementAndValue(psOption, "Value", "PLANAR");
677 15 : CPLCreateXMLElementAndValue(psOption, "Value", "SPHERICAL");
678 : }
679 :
680 : {
681 15 : auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
682 15 : CPLAddXMLAttributeAndValue(psOption, "name", "CREATOR");
683 15 : CPLAddXMLAttributeAndValue(psOption, "type", "string");
684 15 : CPLAddXMLAttributeAndValue(psOption, "description",
685 : "Name of creating application");
686 : }
687 :
688 : {
689 15 : auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
690 15 : CPLAddXMLAttributeAndValue(psOption, "name", "WRITE_COVERING_BBOX");
691 15 : CPLAddXMLAttributeAndValue(psOption, "type", "boolean");
692 15 : CPLAddXMLAttributeAndValue(psOption, "default", "YES");
693 15 : CPLAddXMLAttributeAndValue(psOption, "description",
694 : "Whether to write xmin/ymin/xmax/ymax "
695 : "columns with the bounding box of "
696 : "geometries");
697 : }
698 :
699 : {
700 15 : auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
701 15 : CPLAddXMLAttributeAndValue(psOption, "name", "SORT_BY_BBOX");
702 15 : CPLAddXMLAttributeAndValue(psOption, "type", "boolean");
703 15 : CPLAddXMLAttributeAndValue(psOption, "default", "NO");
704 15 : CPLAddXMLAttributeAndValue(psOption, "description",
705 : "Whether features should be sorted based on "
706 : "the bounding box of their geometries");
707 : }
708 :
709 15 : char *pszXML = CPLSerializeXMLTree(oTree.get());
710 15 : GDALDriver::SetMetadataItem(GDAL_DS_LAYER_CREATIONOPTIONLIST, pszXML);
711 15 : CPLFree(pszXML);
712 : }
713 :
714 : /************************************************************************/
715 : /* RegisterOGRParquet() */
716 : /************************************************************************/
717 :
718 30 : void RegisterOGRParquet()
719 : {
720 30 : if (GDALGetDriverByName(DRIVER_NAME) != nullptr)
721 0 : return;
722 :
723 60 : auto poDriver = std::make_unique<OGRParquetDriver>();
724 30 : OGRParquetDriverSetCommonMetadata(poDriver.get());
725 :
726 30 : poDriver->pfnOpen = OGRParquetDriverOpen;
727 30 : poDriver->pfnCreate = OGRParquetDriverCreate;
728 :
729 30 : poDriver->SetMetadataItem("ARROW_VERSION", ARROW_VERSION_STRING);
730 : #ifdef GDAL_USE_ARROWDATASET
731 30 : poDriver->SetMetadataItem("ARROW_DATASET", "YES");
732 : #endif
733 :
734 30 : GetGDALDriverManager()->RegisterDriver(poDriver.release());
735 :
736 : #if ARROW_VERSION_MAJOR >= 16
737 : // Mostly for tests
738 : const char *pszPath =
739 30 : CPLGetConfigOption("OGR_PARQUET_LOAD_FILE_SYSTEM_FACTORIES", nullptr);
740 30 : if (pszPath)
741 : {
742 0 : auto result = arrow::fs::LoadFileSystemFactories(pszPath);
743 0 : if (!result.ok())
744 : {
745 0 : CPLError(CE_Warning, CPLE_AppDefined,
746 : "arrow::fs::LoadFileSystemFactories() failed with %s",
747 0 : result.message().c_str());
748 : }
749 : }
750 : #endif
751 : }
|