Line data Source code
1 : /******************************************************************************
2 : *
3 : * Project: Parquet Translator
4 : * Purpose: Implements OGRParquetDriver.
5 : * Author: Even Rouault, <even.rouault at spatialys.com>
6 : *
7 : ******************************************************************************
8 : * Copyright (c) 2022, Planet Labs
9 : *
10 : * SPDX-License-Identifier: MIT
11 : ****************************************************************************/
12 :
13 : #include "gdal_pam.h"
14 : #include "ogrsf_frmts.h"
15 :
16 : #include <algorithm>
17 : #include <map>
18 : #include <mutex>
19 : #include <tuple>
20 :
21 : #include "ogr_parquet.h"
22 : #include "ogrparquetdrivercore.h"
23 : #include "memdataset.h"
24 :
25 : #include "../arrow_common/ograrrowrandomaccessfile.h"
26 : #include "../arrow_common/vsiarrowfilesystem.hpp"
27 : #include "../arrow_common/ograrrowwritablefile.h"
28 : #include "../arrow_common/ograrrowdataset.hpp"
29 : #include "../arrow_common/ograrrowlayer.hpp" // for the destructor
30 :
31 : #ifdef GDAL_USE_ARROWDATASET
32 :
33 : /************************************************************************/
34 : /* OpenFromDatasetFactory() */
35 : /************************************************************************/
36 :
37 273 : static GDALDataset *OpenFromDatasetFactory(
38 : const std::string &osBasePath,
39 : const std::shared_ptr<arrow::dataset::DatasetFactory> &factory,
40 : CSLConstList papszOpenOptions,
41 : const std::shared_ptr<arrow::fs::FileSystem> &fs)
42 : {
43 273 : std::shared_ptr<arrow::dataset::Dataset> dataset;
44 546 : PARQUET_ASSIGN_OR_THROW(dataset, factory->Finish());
45 :
46 : auto poMemoryPool = std::shared_ptr<arrow::MemoryPool>(
47 546 : arrow::MemoryPool::CreateDefault().release());
48 :
49 273 : const bool bIsVSI = STARTS_WITH(osBasePath.c_str(), "/vsi");
50 546 : auto poDS = std::make_unique<OGRParquetDataset>(poMemoryPool);
51 : auto poLayer = std::make_unique<OGRParquetDatasetLayer>(
52 546 : poDS.get(), CPLGetBasenameSafe(osBasePath.c_str()).c_str(), bIsVSI,
53 546 : dataset, papszOpenOptions);
54 273 : poDS->SetLayer(std::move(poLayer));
55 273 : poDS->SetFileSystem(fs);
56 546 : return poDS.release();
57 : }
58 :
59 : /************************************************************************/
60 : /* GetFileSystem() */
61 : /************************************************************************/
62 :
63 : static std::tuple<std::shared_ptr<arrow::fs::FileSystem>, std::string>
64 273 : GetFileSystem(std::string &osBasePathInOut,
65 : const std::string &osQueryParameters)
66 : {
67 : // Instantiate file system:
68 : // - VSIArrowFileSystem implementation for /vsi files
69 : // - base implementation for local files (if OGR_PARQUET_USE_VSI set to NO)
70 273 : std::shared_ptr<arrow::fs::FileSystem> fs;
71 273 : const bool bIsVSI = STARTS_WITH(osBasePathInOut.c_str(), "/vsi");
72 : VSIStatBufL sStat;
73 546 : std::string osFSFilename;
74 459 : if ((bIsVSI ||
75 538 : CPLTestBool(CPLGetConfigOption("OGR_PARQUET_USE_VSI", "YES"))) &&
76 265 : VSIStatL(osBasePathInOut.c_str(), &sStat) == 0)
77 : {
78 264 : osFSFilename = osBasePathInOut;
79 264 : fs = std::make_shared<VSIArrowFileSystem>("PARQUET", osQueryParameters);
80 : }
81 : else
82 : {
83 : // FileSystemFromUriOrPath() doesn't like relative paths
84 : // so transform them to absolute.
85 9 : std::string osPath(osBasePathInOut);
86 9 : if (CPLIsFilenameRelative(osPath.c_str()))
87 : {
88 8 : char *pszCurDir = CPLGetCurrentDir();
89 8 : if (pszCurDir == nullptr)
90 0 : return {nullptr, osFSFilename};
91 8 : osPath = CPLFormFilenameSafe(pszCurDir, osPath.c_str(), nullptr);
92 8 : CPLFree(pszCurDir);
93 : }
94 9 : PARQUET_ASSIGN_OR_THROW(
95 : fs, arrow::fs::FileSystemFromUriOrPath(osPath, &osFSFilename));
96 : }
97 273 : return {fs, osFSFilename};
98 : }
99 :
100 : /************************************************************************/
101 : /* OpenParquetDatasetWithMetadata() */
102 : /************************************************************************/
103 :
104 18 : static GDALDataset *OpenParquetDatasetWithMetadata(
105 : const std::string &osBasePathIn, const char *pszMetadataFile,
106 : const std::string &osQueryParameters, CSLConstList papszOpenOptions)
107 : {
108 36 : std::string osBasePath(osBasePathIn);
109 18 : const auto &[fs, osFSFilename] =
110 36 : GetFileSystem(osBasePath, osQueryParameters);
111 :
112 36 : arrow::dataset::ParquetFactoryOptions options;
113 36 : auto partitioningFactory = arrow::dataset::HivePartitioning::MakeFactory();
114 : options.partitioning =
115 18 : arrow::dataset::PartitioningOrFactory(std::move(partitioningFactory));
116 :
117 18 : std::shared_ptr<arrow::dataset::DatasetFactory> factory;
118 : // coverity[copy_constructor_call]
119 54 : PARQUET_ASSIGN_OR_THROW(
120 : factory, arrow::dataset::ParquetDatasetFactory::Make(
121 : osFSFilename + '/' + pszMetadataFile, fs,
122 : std::make_shared<arrow::dataset::ParquetFileFormat>(),
123 : std::move(options)));
124 :
125 36 : return OpenFromDatasetFactory(osBasePath, factory, papszOpenOptions, fs);
126 : }
127 :
128 : /************************************************************************/
129 : /* OpenParquetDatasetWithoutMetadata() */
130 : /************************************************************************/
131 :
132 : static GDALDataset *
133 255 : OpenParquetDatasetWithoutMetadata(const std::string &osBasePathIn,
134 : const std::string &osQueryParameters,
135 : CSLConstList papszOpenOptions)
136 : {
137 510 : std::string osBasePath(osBasePathIn);
138 255 : const auto &[fs, osFSFilename] =
139 510 : GetFileSystem(osBasePath, osQueryParameters);
140 :
141 510 : arrow::dataset::FileSystemFactoryOptions options;
142 255 : std::shared_ptr<arrow::dataset::DatasetFactory> factory;
143 :
144 510 : const auto fileInfo = fs->GetFileInfo(osFSFilename);
145 255 : if (fileInfo->IsFile())
146 : {
147 : // coverity[copy_constructor_call]
148 1008 : PARQUET_ASSIGN_OR_THROW(
149 : factory, arrow::dataset::FileSystemDatasetFactory::Make(
150 : fs, {std::move(osFSFilename)},
151 : std::make_shared<arrow::dataset::ParquetFileFormat>(),
152 : std::move(options)));
153 : }
154 : else
155 : {
156 : auto partitioningFactory =
157 6 : arrow::dataset::HivePartitioning::MakeFactory();
158 6 : options.partitioning = arrow::dataset::PartitioningOrFactory(
159 6 : std::move(partitioningFactory));
160 :
161 6 : arrow::fs::FileSelector selector;
162 3 : selector.base_dir = std::move(osFSFilename);
163 3 : selector.recursive = true;
164 :
165 : // coverity[copy_constructor_call]
166 6 : PARQUET_ASSIGN_OR_THROW(
167 : factory, arrow::dataset::FileSystemDatasetFactory::Make(
168 : fs, std::move(selector),
169 : std::make_shared<arrow::dataset::ParquetFileFormat>(),
170 : std::move(options)));
171 : }
172 :
173 510 : return OpenFromDatasetFactory(osBasePath, factory, papszOpenOptions, fs);
174 : }
175 :
176 : #endif
177 :
178 : /************************************************************************/
179 : /* BuildMemDatasetWithRowGroupExtents() */
180 : /************************************************************************/
181 :
182 : /** Builds a MEM dataset that contains, for each row-group of the input file,
183 : * the feature count and spatial extent of the features of this row group,
184 : * using Parquet statistics. This assumes that the Parquet file declares
185 : * a "covering":{"bbox":{ ... }} metadata item.
186 : *
187 : * Only for debug purposes.
188 : */
189 1 : static GDALDataset *BuildMemDatasetWithRowGroupExtents(OGRParquetLayer *poLayer)
190 : {
191 1 : int iParquetXMin = -1;
192 1 : int iParquetYMin = -1;
193 1 : int iParquetXMax = -1;
194 1 : int iParquetYMax = -1;
195 1 : if (poLayer->GeomColsBBOXParquet(0, iParquetXMin, iParquetYMin,
196 : iParquetXMax, iParquetYMax))
197 : {
198 : auto poMemDS = std::unique_ptr<GDALDataset>(
199 2 : MEMDataset::Create("", 0, 0, 0, GDT_Unknown, nullptr));
200 1 : if (!poMemDS)
201 0 : return nullptr;
202 1 : OGRSpatialReference *poTmpSRS = nullptr;
203 1 : const auto poSrcSRS = poLayer->GetSpatialRef();
204 1 : if (poSrcSRS)
205 0 : poTmpSRS = poSrcSRS->Clone();
206 : auto poMemLayer =
207 1 : poMemDS->CreateLayer("footprint", poTmpSRS, wkbPolygon, nullptr);
208 1 : if (poTmpSRS)
209 0 : poTmpSRS->Release();
210 1 : if (!poMemLayer)
211 0 : return nullptr;
212 1 : poMemLayer->CreateField(
213 1 : std::make_unique<OGRFieldDefn>("feature_count", OFTInteger64)
214 1 : .get());
215 :
216 : const auto metadata =
217 2 : poLayer->GetReader()->parquet_reader()->metadata();
218 1 : const int numRowGroups = metadata->num_row_groups();
219 15 : for (int iRowGroup = 0; iRowGroup < numRowGroups; ++iRowGroup)
220 : {
221 28 : std::string osMinTmp, osMaxTmp;
222 : OGRField unusedF;
223 : bool unusedB;
224 : OGRFieldSubType unusedSubType;
225 :
226 : OGRField sXMin;
227 14 : OGR_RawField_SetNull(&sXMin);
228 14 : bool bFoundXMin = false;
229 14 : OGRFieldType eXMinType = OFTMaxType;
230 :
231 : OGRField sYMin;
232 14 : OGR_RawField_SetNull(&sYMin);
233 14 : bool bFoundYMin = false;
234 14 : OGRFieldType eYMinType = OFTMaxType;
235 :
236 : OGRField sXMax;
237 14 : OGR_RawField_SetNull(&sXMax);
238 14 : bool bFoundXMax = false;
239 14 : OGRFieldType eXMaxType = OFTMaxType;
240 :
241 : OGRField sYMax;
242 14 : OGR_RawField_SetNull(&sYMax);
243 14 : bool bFoundYMax = false;
244 14 : OGRFieldType eYMaxType = OFTMaxType;
245 :
246 14 : if (poLayer->GetMinMaxForParquetCol(
247 : iRowGroup, iParquetXMin, nullptr,
248 : /* bComputeMin = */ true, sXMin, bFoundXMin,
249 : /* bComputeMax = */ false, unusedF, unusedB, eXMinType,
250 8 : unusedSubType, osMinTmp, osMaxTmp) &&
251 8 : bFoundXMin && eXMinType == OFTReal &&
252 22 : poLayer->GetMinMaxForParquetCol(
253 : iRowGroup, iParquetYMin, nullptr,
254 : /* bComputeMin = */ true, sYMin, bFoundYMin,
255 : /* bComputeMax = */ false, unusedF, unusedB, eYMinType,
256 8 : unusedSubType, osMinTmp, osMaxTmp) &&
257 8 : bFoundYMin && eYMinType == OFTReal &&
258 22 : poLayer->GetMinMaxForParquetCol(
259 : iRowGroup, iParquetXMax, nullptr,
260 : /* bComputeMin = */ false, unusedF, unusedB,
261 : /* bComputeMax = */ true, sXMax, bFoundXMax, eXMaxType,
262 8 : unusedSubType, osMaxTmp, osMaxTmp) &&
263 8 : bFoundXMax && eXMaxType == OFTReal &&
264 22 : poLayer->GetMinMaxForParquetCol(
265 : iRowGroup, iParquetYMax, nullptr,
266 : /* bComputeMin = */ false, unusedF, unusedB,
267 : /* bComputeMax = */ true, sYMax, bFoundYMax, eYMaxType,
268 8 : unusedSubType, osMaxTmp, osMaxTmp) &&
269 22 : bFoundYMax && eYMaxType == OFTReal)
270 : {
271 16 : OGRFeature oFeat(poMemLayer->GetLayerDefn());
272 8 : oFeat.SetField(0,
273 : static_cast<GIntBig>(
274 8 : metadata->RowGroup(iRowGroup)->num_rows()));
275 16 : auto poPoly = std::make_unique<OGRPolygon>();
276 8 : auto poLR = std::make_unique<OGRLinearRing>();
277 8 : poLR->addPoint(sXMin.Real, sYMin.Real);
278 8 : poLR->addPoint(sXMin.Real, sYMax.Real);
279 8 : poLR->addPoint(sXMax.Real, sYMax.Real);
280 8 : poLR->addPoint(sXMax.Real, sYMin.Real);
281 8 : poLR->addPoint(sXMin.Real, sYMin.Real);
282 8 : poPoly->addRingDirectly(poLR.release());
283 8 : oFeat.SetGeometryDirectly(poPoly.release());
284 8 : CPL_IGNORE_RET_VAL(poMemLayer->CreateFeature(&oFeat));
285 : }
286 : }
287 :
288 1 : return poMemDS.release();
289 : }
290 0 : return nullptr;
291 : }
292 :
293 : /************************************************************************/
294 : /* Open() */
295 : /************************************************************************/
296 :
297 1664 : static GDALDataset *OGRParquetDriverOpen(GDALOpenInfo *poOpenInfo)
298 : {
299 1664 : if (poOpenInfo->eAccess == GA_Update)
300 62 : return nullptr;
301 :
302 : #ifdef GDAL_USE_ARROWDATASET
303 3204 : std::string osBasePath(poOpenInfo->pszFilename);
304 3204 : std::string osQueryParameters;
305 : const bool bStartedWithParquetPrefix =
306 1602 : STARTS_WITH(osBasePath.c_str(), "PARQUET:");
307 :
308 1602 : if (bStartedWithParquetPrefix)
309 : {
310 261 : osBasePath = osBasePath.substr(strlen("PARQUET:"));
311 : }
312 :
313 : // Little trick to allow using syntax of
314 : // https://github.com/opengeospatial/geoparquet/discussions/101
315 : // ogrinfo
316 : // "/vsicurl/https://ai4edataeuwest.blob.core.windows.net/us-census/2020/cb_2020_us_vtd_500k.parquet?${SAS_TOKEN}"
317 1602 : if (STARTS_WITH(osBasePath.c_str(), "/vsicurl/"))
318 : {
319 2 : const auto nPos = osBasePath.find(".parquet?st=");
320 2 : if (nPos != std::string::npos)
321 : {
322 0 : osQueryParameters = osBasePath.substr(nPos + strlen(".parquet"));
323 0 : osBasePath.resize(nPos + strlen(".parquet"));
324 : }
325 : }
326 :
327 2465 : if (bStartedWithParquetPrefix || poOpenInfo->bIsDirectory ||
328 863 : !osQueryParameters.empty())
329 : {
330 : VSIStatBufL sStat;
331 739 : if (!osBasePath.empty() && osBasePath.back() == '/')
332 0 : osBasePath.pop_back();
333 : const std::string osMetadataPath =
334 739 : CPLFormFilenameSafe(osBasePath.c_str(), "_metadata", nullptr);
335 739 : if (CPLTestBool(
336 2217 : CPLGetConfigOption("OGR_PARQUET_USE_METADATA_FILE", "YES")) &&
337 1478 : VSIStatL((osMetadataPath + osQueryParameters).c_str(), &sStat) == 0)
338 : {
339 : // If there's a _metadata file, then use it to avoid listing files
340 : try
341 : {
342 36 : return OpenParquetDatasetWithMetadata(
343 : osBasePath, "_metadata", osQueryParameters,
344 18 : poOpenInfo->papszOpenOptions);
345 : }
346 0 : catch (const std::exception &e)
347 : {
348 0 : CPLError(CE_Failure, CPLE_AppDefined, "Parquet exception: %s",
349 0 : e.what());
350 : }
351 0 : return nullptr;
352 : }
353 : else
354 : {
355 721 : bool bLikelyParquetDataset = false;
356 721 : if (poOpenInfo->bIsDirectory)
357 : {
358 : // Detect if the directory contains .parquet files, or
359 : // subdirectories with a name of the form "key=value", typical
360 : // of HIVE partitioning.
361 936 : const CPLStringList aosFiles(VSIReadDir(osBasePath.c_str()));
362 22205 : for (const char *pszFilename : cpl::Iterate(aosFiles))
363 : {
364 21739 : if (EQUAL(CPLGetExtensionSafe(pszFilename).c_str(),
365 : "parquet"))
366 : {
367 2 : bLikelyParquetDataset = true;
368 2 : break;
369 : }
370 21737 : else if (strchr(pszFilename, '='))
371 : {
372 : // HIVE partitioning
373 0 : if (VSIStatL(CPLFormFilenameSafe(osBasePath.c_str(),
374 : pszFilename, nullptr)
375 : .c_str(),
376 0 : &sStat) == 0 &&
377 0 : VSI_ISDIR(sStat.st_mode))
378 : {
379 0 : bLikelyParquetDataset = true;
380 0 : break;
381 : }
382 : }
383 : }
384 : }
385 :
386 721 : if (bStartedWithParquetPrefix || bLikelyParquetDataset)
387 : {
388 : try
389 : {
390 510 : return OpenParquetDatasetWithoutMetadata(
391 : osBasePath, osQueryParameters,
392 255 : poOpenInfo->papszOpenOptions);
393 : }
394 0 : catch (const std::exception &e)
395 : {
396 : // If we aren't quite sure that the passed file name is
397 : // a directory, then silently continue
398 0 : if (poOpenInfo->bIsDirectory)
399 : {
400 0 : CPLError(CE_Failure, CPLE_AppDefined,
401 0 : "Parquet exception: %s", e.what());
402 0 : return nullptr;
403 : }
404 : }
405 : }
406 : }
407 : }
408 : #endif
409 :
410 1329 : if (!OGRParquetDriverIdentify(poOpenInfo))
411 : {
412 0 : return nullptr;
413 : }
414 :
415 1329 : if (poOpenInfo->bIsDirectory)
416 466 : return nullptr;
417 :
418 1726 : std::string osFilename(poOpenInfo->pszFilename);
419 863 : if (STARTS_WITH(poOpenInfo->pszFilename, "PARQUET:"))
420 : {
421 0 : osFilename = poOpenInfo->pszFilename + strlen("PARQUET:");
422 : }
423 :
424 : try
425 : {
426 863 : std::shared_ptr<arrow::io::RandomAccessFile> infile;
427 1341 : if (STARTS_WITH(osFilename.c_str(), "/vsi") ||
428 478 : CPLTestBool(CPLGetConfigOption("OGR_PARQUET_USE_VSI", "NO")))
429 : {
430 385 : VSIVirtualHandleUniquePtr fp(poOpenInfo->fpL);
431 385 : poOpenInfo->fpL = nullptr;
432 385 : if (fp == nullptr)
433 : {
434 0 : fp.reset(VSIFOpenL(osFilename.c_str(), "rb"));
435 0 : if (fp == nullptr)
436 0 : return nullptr;
437 : }
438 770 : infile = std::make_shared<OGRArrowRandomAccessFile>(osFilename,
439 770 : std::move(fp));
440 : }
441 : else
442 : {
443 478 : PARQUET_ASSIGN_OR_THROW(infile,
444 : arrow::io::ReadableFile::Open(osFilename));
445 : }
446 :
447 : // Open Parquet file reader
448 863 : std::unique_ptr<parquet::arrow::FileReader> arrow_reader;
449 : auto poMemoryPool = std::shared_ptr<arrow::MemoryPool>(
450 1726 : arrow::MemoryPool::CreateDefault().release());
451 : #if ARROW_VERSION_MAJOR >= 19
452 2589 : PARQUET_ASSIGN_OR_THROW(
453 : arrow_reader,
454 : parquet::arrow::OpenFile(std::move(infile), poMemoryPool.get()));
455 : #else
456 : auto st = parquet::arrow::OpenFile(std::move(infile),
457 : poMemoryPool.get(), &arrow_reader);
458 : if (!st.ok())
459 : {
460 : CPLError(CE_Failure, CPLE_AppDefined,
461 : "parquet::arrow::OpenFile() failed");
462 : return nullptr;
463 : }
464 : #endif
465 :
466 1724 : auto poDS = std::make_unique<OGRParquetDataset>(poMemoryPool);
467 : auto poLayer = std::make_unique<OGRParquetLayer>(
468 1724 : poDS.get(), CPLGetBasenameSafe(osFilename.c_str()).c_str(),
469 2586 : std::move(arrow_reader), poOpenInfo->papszOpenOptions);
470 :
471 : // For debug purposes: return a layer with the extent of each row group
472 862 : if (CPLTestBool(
473 : CPLGetConfigOption("OGR_PARQUET_SHOW_ROW_GROUP_EXTENT", "NO")))
474 : {
475 1 : return BuildMemDatasetWithRowGroupExtents(poLayer.get());
476 : }
477 :
478 861 : poDS->SetLayer(std::move(poLayer));
479 861 : return poDS.release();
480 : }
481 1 : catch (const std::exception &e)
482 : {
483 1 : CPLError(CE_Failure, CPLE_AppDefined, "Parquet exception: %s",
484 1 : e.what());
485 1 : return nullptr;
486 : }
487 : }
488 :
489 : /************************************************************************/
490 : /* Create() */
491 : /************************************************************************/
492 :
493 269 : static GDALDataset *OGRParquetDriverCreate(const char *pszName, int nXSize,
494 : int nYSize, int nBands,
495 : GDALDataType eType,
496 : char ** /* papszOptions */)
497 : {
498 269 : if (!(nXSize == 0 && nYSize == 0 && nBands == 0 && eType == GDT_Unknown))
499 0 : return nullptr;
500 :
501 : try
502 : {
503 269 : std::shared_ptr<arrow::io::OutputStream> out_file;
504 351 : if (STARTS_WITH(pszName, "/vsi") ||
505 82 : CPLTestBool(CPLGetConfigOption("OGR_PARQUET_USE_VSI", "YES")))
506 : {
507 269 : VSILFILE *fp = VSIFOpenL(pszName, "wb");
508 269 : if (fp == nullptr)
509 : {
510 1 : CPLError(CE_Failure, CPLE_FileIO, "Cannot create %s", pszName);
511 1 : return nullptr;
512 : }
513 268 : out_file = std::make_shared<OGRArrowWritableFile>(fp);
514 : }
515 : else
516 : {
517 0 : PARQUET_ASSIGN_OR_THROW(out_file,
518 : arrow::io::FileOutputStream::Open(pszName));
519 : }
520 :
521 268 : return new OGRParquetWriterDataset(out_file);
522 : }
523 0 : catch (const std::exception &e)
524 : {
525 0 : CPLError(CE_Failure, CPLE_AppDefined, "Parquet exception: %s",
526 0 : e.what());
527 0 : return nullptr;
528 : }
529 : }
530 :
531 : /************************************************************************/
532 : /* OGRParquetDriver() */
533 : /************************************************************************/
534 :
535 : class OGRParquetDriver final : public GDALDriver
536 : {
537 : std::mutex m_oMutex{};
538 : bool m_bMetadataInitialized = false;
539 : void InitMetadata();
540 :
541 : public:
542 1990 : const char *GetMetadataItem(const char *pszName,
543 : const char *pszDomain) override
544 : {
545 3980 : std::lock_guard oLock(m_oMutex);
546 1990 : if (EQUAL(pszName, GDAL_DS_LAYER_CREATIONOPTIONLIST))
547 : {
548 301 : InitMetadata();
549 : }
550 3980 : return GDALDriver::GetMetadataItem(pszName, pszDomain);
551 : }
552 :
553 51 : char **GetMetadata(const char *pszDomain) override
554 : {
555 102 : std::lock_guard oLock(m_oMutex);
556 51 : InitMetadata();
557 102 : return GDALDriver::GetMetadata(pszDomain);
558 : }
559 : };
560 :
561 352 : void OGRParquetDriver::InitMetadata()
562 : {
563 352 : if (m_bMetadataInitialized)
564 329 : return;
565 23 : m_bMetadataInitialized = true;
566 :
567 : CPLXMLTreeCloser oTree(
568 46 : CPLCreateXMLNode(nullptr, CXT_Element, "LayerCreationOptionList"));
569 :
570 46 : std::vector<const char *> apszCompressionMethods;
571 23 : bool bHasSnappy = false;
572 161 : for (const char *pszMethod :
573 184 : {"SNAPPY", "GZIP", "BROTLI", "ZSTD", "LZ4_RAW", "LZO", "LZ4_HADOOP"})
574 : {
575 : auto oResult = arrow::util::Codec::GetCompressionType(
576 322 : CPLString(pszMethod).tolower());
577 161 : if (oResult.ok() && arrow::util::Codec::IsAvailable(*oResult))
578 : {
579 138 : if (EQUAL(pszMethod, "SNAPPY"))
580 23 : bHasSnappy = true;
581 138 : apszCompressionMethods.emplace_back(pszMethod);
582 : }
583 : }
584 :
585 : {
586 23 : auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
587 23 : CPLAddXMLAttributeAndValue(psOption, "name", "COMPRESSION");
588 23 : CPLAddXMLAttributeAndValue(psOption, "type", "string-select");
589 23 : CPLAddXMLAttributeAndValue(psOption, "description",
590 : "Compression method");
591 23 : CPLAddXMLAttributeAndValue(psOption, "default",
592 : bHasSnappy ? "SNAPPY" : "NONE");
593 : {
594 23 : auto poValueNode = CPLCreateXMLNode(psOption, CXT_Element, "Value");
595 23 : CPLAddXMLAttributeAndValue(poValueNode, "alias", "UNCOMPRESSED");
596 23 : CPLCreateXMLNode(poValueNode, CXT_Text, "NONE");
597 : }
598 161 : for (const char *pszMethod : apszCompressionMethods)
599 : {
600 138 : auto poValueNode = CPLCreateXMLNode(psOption, CXT_Element, "Value");
601 138 : CPLCreateXMLNode(poValueNode, CXT_Text, pszMethod);
602 : }
603 : }
604 :
605 : {
606 23 : auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
607 23 : CPLAddXMLAttributeAndValue(psOption, "name", "GEOMETRY_ENCODING");
608 23 : CPLAddXMLAttributeAndValue(psOption, "type", "string-select");
609 23 : CPLAddXMLAttributeAndValue(psOption, "description",
610 : "Encoding of geometry columns");
611 23 : CPLAddXMLAttributeAndValue(psOption, "default", "WKB");
612 92 : for (const char *pszEncoding :
613 115 : {"WKB", "WKT", "GEOARROW", "GEOARROW_INTERLEAVED"})
614 : {
615 92 : auto poValueNode = CPLCreateXMLNode(psOption, CXT_Element, "Value");
616 92 : CPLCreateXMLNode(poValueNode, CXT_Text, pszEncoding);
617 92 : if (EQUAL(pszEncoding, "GEOARROW"))
618 23 : CPLAddXMLAttributeAndValue(poValueNode, "alias",
619 : "GEOARROW_STRUCT");
620 : }
621 : }
622 :
623 : {
624 23 : auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
625 23 : CPLAddXMLAttributeAndValue(psOption, "name", "ROW_GROUP_SIZE");
626 23 : CPLAddXMLAttributeAndValue(psOption, "type", "integer");
627 23 : CPLAddXMLAttributeAndValue(psOption, "description",
628 : "Maximum number of rows per group");
629 23 : CPLAddXMLAttributeAndValue(psOption, "default", "65536");
630 : }
631 :
632 : {
633 23 : auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
634 23 : CPLAddXMLAttributeAndValue(psOption, "name", "GEOMETRY_NAME");
635 23 : CPLAddXMLAttributeAndValue(psOption, "type", "string");
636 23 : CPLAddXMLAttributeAndValue(psOption, "description",
637 : "Name of geometry column");
638 23 : CPLAddXMLAttributeAndValue(psOption, "default", "geometry");
639 : }
640 :
641 : {
642 23 : auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
643 23 : CPLAddXMLAttributeAndValue(psOption, "name", "COORDINATE_PRECISION");
644 23 : CPLAddXMLAttributeAndValue(psOption, "type", "float");
645 23 : CPLAddXMLAttributeAndValue(psOption, "description",
646 : "Number of decimals for coordinates (only "
647 : "for GEOMETRY_ENCODING=WKT)");
648 : }
649 :
650 : {
651 23 : auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
652 23 : CPLAddXMLAttributeAndValue(psOption, "name", "FID");
653 23 : CPLAddXMLAttributeAndValue(psOption, "type", "string");
654 23 : CPLAddXMLAttributeAndValue(psOption, "description",
655 : "Name of the FID column to create");
656 : }
657 :
658 : {
659 23 : auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
660 23 : CPLAddXMLAttributeAndValue(psOption, "name", "POLYGON_ORIENTATION");
661 23 : CPLAddXMLAttributeAndValue(psOption, "type", "string-select");
662 23 : CPLAddXMLAttributeAndValue(
663 : psOption, "description",
664 : "Which ring orientation to use for polygons");
665 23 : CPLAddXMLAttributeAndValue(psOption, "default", "COUNTERCLOCKWISE");
666 23 : CPLCreateXMLElementAndValue(psOption, "Value", "COUNTERCLOCKWISE");
667 23 : CPLCreateXMLElementAndValue(psOption, "Value", "UNMODIFIED");
668 : }
669 :
670 : {
671 23 : auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
672 23 : CPLAddXMLAttributeAndValue(psOption, "name", "EDGES");
673 23 : CPLAddXMLAttributeAndValue(psOption, "type", "string-select");
674 23 : CPLAddXMLAttributeAndValue(
675 : psOption, "description",
676 : "Name of the coordinate system for the edges");
677 23 : CPLAddXMLAttributeAndValue(psOption, "default", "PLANAR");
678 23 : CPLCreateXMLElementAndValue(psOption, "Value", "PLANAR");
679 23 : CPLCreateXMLElementAndValue(psOption, "Value", "SPHERICAL");
680 : }
681 :
682 : {
683 23 : auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
684 23 : CPLAddXMLAttributeAndValue(psOption, "name", "CREATOR");
685 23 : CPLAddXMLAttributeAndValue(psOption, "type", "string");
686 23 : CPLAddXMLAttributeAndValue(psOption, "description",
687 : "Name of creating application");
688 : }
689 :
690 : {
691 23 : auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
692 23 : CPLAddXMLAttributeAndValue(psOption, "name", "WRITE_COVERING_BBOX");
693 23 : CPLAddXMLAttributeAndValue(psOption, "type", "boolean");
694 23 : CPLAddXMLAttributeAndValue(psOption, "default", "YES");
695 23 : CPLAddXMLAttributeAndValue(psOption, "description",
696 : "Whether to write xmin/ymin/xmax/ymax "
697 : "columns with the bounding box of "
698 : "geometries");
699 : }
700 :
701 : {
702 23 : auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
703 23 : CPLAddXMLAttributeAndValue(psOption, "name", "SORT_BY_BBOX");
704 23 : CPLAddXMLAttributeAndValue(psOption, "type", "boolean");
705 23 : CPLAddXMLAttributeAndValue(psOption, "default", "NO");
706 23 : CPLAddXMLAttributeAndValue(psOption, "description",
707 : "Whether features should be sorted based on "
708 : "the bounding box of their geometries");
709 : }
710 :
711 23 : char *pszXML = CPLSerializeXMLTree(oTree.get());
712 23 : GDALDriver::SetMetadataItem(GDAL_DS_LAYER_CREATIONOPTIONLIST, pszXML);
713 23 : CPLFree(pszXML);
714 : }
715 :
716 : /************************************************************************/
717 : /* RegisterOGRParquet() */
718 : /************************************************************************/
719 :
720 36 : void RegisterOGRParquet()
721 : {
722 36 : if (GDALGetDriverByName(DRIVER_NAME) != nullptr)
723 0 : return;
724 :
725 72 : auto poDriver = std::make_unique<OGRParquetDriver>();
726 36 : OGRParquetDriverSetCommonMetadata(poDriver.get());
727 :
728 36 : poDriver->pfnOpen = OGRParquetDriverOpen;
729 36 : poDriver->pfnCreate = OGRParquetDriverCreate;
730 :
731 36 : poDriver->SetMetadataItem("ARROW_VERSION", ARROW_VERSION_STRING);
732 : #ifdef GDAL_USE_ARROWDATASET
733 36 : poDriver->SetMetadataItem("ARROW_DATASET", "YES");
734 : #endif
735 :
736 36 : GetGDALDriverManager()->RegisterDriver(poDriver.release());
737 :
738 : #if ARROW_VERSION_MAJOR >= 16
739 : // Mostly for tests
740 : const char *pszPath =
741 36 : CPLGetConfigOption("OGR_PARQUET_LOAD_FILE_SYSTEM_FACTORIES", nullptr);
742 36 : if (pszPath)
743 : {
744 0 : auto result = arrow::fs::LoadFileSystemFactories(pszPath);
745 0 : if (!result.ok())
746 : {
747 0 : CPLError(CE_Warning, CPLE_AppDefined,
748 : "arrow::fs::LoadFileSystemFactories() failed with %s",
749 0 : result.message().c_str());
750 : }
751 : }
752 : #endif
753 : }
|