LCOV - code coverage report
Current view: top level - ogr/ogrsf_frmts/parquet - ogrparquetdriver.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 289 328 88.1 %
Date: 2025-05-31 00:00:17 Functions: 11 11 100.0 %

          Line data    Source code
       1             : /******************************************************************************
       2             :  *
       3             :  * Project:  Parquet Translator
       4             :  * Purpose:  Implements OGRParquetDriver.
       5             :  * Author:   Even Rouault, <even.rouault at spatialys.com>
       6             :  *
       7             :  ******************************************************************************
       8             :  * Copyright (c) 2022, Planet Labs
       9             :  *
      10             :  * SPDX-License-Identifier: MIT
      11             :  ****************************************************************************/
      12             : 
      13             : #include "gdal_pam.h"
      14             : #include "ogrsf_frmts.h"
      15             : 
      16             : #include <algorithm>
      17             : #include <map>
      18             : #include <mutex>
      19             : #include <tuple>
      20             : 
      21             : #include "ogr_parquet.h"
      22             : #include "ogrparquetdrivercore.h"
      23             : #include "memdataset.h"
      24             : 
      25             : #include "../arrow_common/ograrrowrandomaccessfile.h"
      26             : #include "../arrow_common/vsiarrowfilesystem.hpp"
      27             : #include "../arrow_common/ograrrowwritablefile.h"
      28             : #include "../arrow_common/ograrrowdataset.hpp"
      29             : #include "../arrow_common/ograrrowlayer.hpp"  // for the destructor
      30             : 
      31             : #ifdef GDAL_USE_ARROWDATASET
      32             : 
      33             : /************************************************************************/
      34             : /*                      OpenFromDatasetFactory()                        */
      35             : /************************************************************************/
      36             : 
      37         273 : static GDALDataset *OpenFromDatasetFactory(
      38             :     const std::string &osBasePath,
      39             :     const std::shared_ptr<arrow::dataset::DatasetFactory> &factory,
      40             :     CSLConstList papszOpenOptions,
      41             :     const std::shared_ptr<arrow::fs::FileSystem> &fs)
      42             : {
      43         273 :     std::shared_ptr<arrow::dataset::Dataset> dataset;
      44         546 :     PARQUET_ASSIGN_OR_THROW(dataset, factory->Finish());
      45             : 
      46             :     auto poMemoryPool = std::shared_ptr<arrow::MemoryPool>(
      47         546 :         arrow::MemoryPool::CreateDefault().release());
      48             : 
      49         273 :     const bool bIsVSI = STARTS_WITH(osBasePath.c_str(), "/vsi");
      50         546 :     auto poDS = std::make_unique<OGRParquetDataset>(poMemoryPool);
      51             :     auto poLayer = std::make_unique<OGRParquetDatasetLayer>(
      52         546 :         poDS.get(), CPLGetBasenameSafe(osBasePath.c_str()).c_str(), bIsVSI,
      53         546 :         dataset, papszOpenOptions);
      54         273 :     poDS->SetLayer(std::move(poLayer));
      55         273 :     poDS->SetFileSystem(fs);
      56         546 :     return poDS.release();
      57             : }
      58             : 
      59             : /************************************************************************/
      60             : /*                         GetFileSystem()                              */
      61             : /************************************************************************/
      62             : 
      63             : static std::tuple<std::shared_ptr<arrow::fs::FileSystem>, std::string>
      64         273 : GetFileSystem(std::string &osBasePathInOut,
      65             :               const std::string &osQueryParameters)
      66             : {
      67             :     // Instantiate file system:
      68             :     // - VSIArrowFileSystem implementation for /vsi files
      69             :     // - base implementation for local files (if OGR_PARQUET_USE_VSI set to NO)
      70         273 :     std::shared_ptr<arrow::fs::FileSystem> fs;
      71         273 :     const bool bIsVSI = STARTS_WITH(osBasePathInOut.c_str(), "/vsi");
      72             :     VSIStatBufL sStat;
      73         546 :     std::string osFSFilename;
      74         459 :     if ((bIsVSI ||
      75         538 :          CPLTestBool(CPLGetConfigOption("OGR_PARQUET_USE_VSI", "YES"))) &&
      76         265 :         VSIStatL(osBasePathInOut.c_str(), &sStat) == 0)
      77             :     {
      78         264 :         osFSFilename = osBasePathInOut;
      79         264 :         fs = std::make_shared<VSIArrowFileSystem>("PARQUET", osQueryParameters);
      80             :     }
      81             :     else
      82             :     {
      83             :         // FileSystemFromUriOrPath() doesn't like relative paths
      84             :         // so transform them to absolute.
      85           9 :         std::string osPath(osBasePathInOut);
      86           9 :         if (CPLIsFilenameRelative(osPath.c_str()))
      87             :         {
      88           8 :             char *pszCurDir = CPLGetCurrentDir();
      89           8 :             if (pszCurDir == nullptr)
      90           0 :                 return {nullptr, osFSFilename};
      91           8 :             osPath = CPLFormFilenameSafe(pszCurDir, osPath.c_str(), nullptr);
      92           8 :             CPLFree(pszCurDir);
      93             :         }
      94           9 :         PARQUET_ASSIGN_OR_THROW(
      95             :             fs, arrow::fs::FileSystemFromUriOrPath(osPath, &osFSFilename));
      96             :     }
      97         273 :     return {fs, osFSFilename};
      98             : }
      99             : 
     100             : /************************************************************************/
     101             : /*                  OpenParquetDatasetWithMetadata()                    */
     102             : /************************************************************************/
     103             : 
     104          18 : static GDALDataset *OpenParquetDatasetWithMetadata(
     105             :     const std::string &osBasePathIn, const char *pszMetadataFile,
     106             :     const std::string &osQueryParameters, CSLConstList papszOpenOptions)
     107             : {
     108          36 :     std::string osBasePath(osBasePathIn);
     109          18 :     const auto &[fs, osFSFilename] =
     110          36 :         GetFileSystem(osBasePath, osQueryParameters);
     111             : 
     112          36 :     arrow::dataset::ParquetFactoryOptions options;
     113          36 :     auto partitioningFactory = arrow::dataset::HivePartitioning::MakeFactory();
     114             :     options.partitioning =
     115          18 :         arrow::dataset::PartitioningOrFactory(std::move(partitioningFactory));
     116             : 
     117          18 :     std::shared_ptr<arrow::dataset::DatasetFactory> factory;
     118             :     // coverity[copy_constructor_call]
     119          54 :     PARQUET_ASSIGN_OR_THROW(
     120             :         factory, arrow::dataset::ParquetDatasetFactory::Make(
     121             :                      osFSFilename + '/' + pszMetadataFile, fs,
     122             :                      std::make_shared<arrow::dataset::ParquetFileFormat>(),
     123             :                      std::move(options)));
     124             : 
     125          36 :     return OpenFromDatasetFactory(osBasePath, factory, papszOpenOptions, fs);
     126             : }
     127             : 
     128             : /************************************************************************/
     129             : /*                 OpenParquetDatasetWithoutMetadata()                  */
     130             : /************************************************************************/
     131             : 
     132             : static GDALDataset *
     133         255 : OpenParquetDatasetWithoutMetadata(const std::string &osBasePathIn,
     134             :                                   const std::string &osQueryParameters,
     135             :                                   CSLConstList papszOpenOptions)
     136             : {
     137         510 :     std::string osBasePath(osBasePathIn);
     138         255 :     const auto &[fs, osFSFilename] =
     139         510 :         GetFileSystem(osBasePath, osQueryParameters);
     140             : 
     141         510 :     arrow::dataset::FileSystemFactoryOptions options;
     142         255 :     std::shared_ptr<arrow::dataset::DatasetFactory> factory;
     143             : 
     144         510 :     const auto fileInfo = fs->GetFileInfo(osFSFilename);
     145         255 :     if (fileInfo->IsFile())
     146             :     {
     147             :         // coverity[copy_constructor_call]
     148        1008 :         PARQUET_ASSIGN_OR_THROW(
     149             :             factory, arrow::dataset::FileSystemDatasetFactory::Make(
     150             :                          fs, {std::move(osFSFilename)},
     151             :                          std::make_shared<arrow::dataset::ParquetFileFormat>(),
     152             :                          std::move(options)));
     153             :     }
     154             :     else
     155             :     {
     156             :         auto partitioningFactory =
     157           6 :             arrow::dataset::HivePartitioning::MakeFactory();
     158           6 :         options.partitioning = arrow::dataset::PartitioningOrFactory(
     159           6 :             std::move(partitioningFactory));
     160             : 
     161           6 :         arrow::fs::FileSelector selector;
     162           3 :         selector.base_dir = std::move(osFSFilename);
     163           3 :         selector.recursive = true;
     164             : 
     165             :         // coverity[copy_constructor_call]
     166           6 :         PARQUET_ASSIGN_OR_THROW(
     167             :             factory, arrow::dataset::FileSystemDatasetFactory::Make(
     168             :                          fs, std::move(selector),
     169             :                          std::make_shared<arrow::dataset::ParquetFileFormat>(),
     170             :                          std::move(options)));
     171             :     }
     172             : 
     173         510 :     return OpenFromDatasetFactory(osBasePath, factory, papszOpenOptions, fs);
     174             : }
     175             : 
     176             : #endif
     177             : 
     178             : /************************************************************************/
     179             : /*                  BuildMemDatasetWithRowGroupExtents()                */
     180             : /************************************************************************/
     181             : 
     182             : /** Builds a MEM dataset that contains, for each row-group of the input file,
     183             :  * the feature count and spatial extent of the features of this row group,
     184             :  * using Parquet statistics. This assumes that the Parquet file declares
     185             :  * a "covering":{"bbox":{ ... }} metadata item.
     186             :  *
     187             :  * Only for debug purposes.
     188             :  */
     189           1 : static GDALDataset *BuildMemDatasetWithRowGroupExtents(OGRParquetLayer *poLayer)
     190             : {
     191           1 :     int iParquetXMin = -1;
     192           1 :     int iParquetYMin = -1;
     193           1 :     int iParquetXMax = -1;
     194           1 :     int iParquetYMax = -1;
     195           1 :     if (poLayer->GeomColsBBOXParquet(0, iParquetXMin, iParquetYMin,
     196             :                                      iParquetXMax, iParquetYMax))
     197             :     {
     198             :         auto poMemDS = std::unique_ptr<GDALDataset>(
     199           2 :             MEMDataset::Create("", 0, 0, 0, GDT_Unknown, nullptr));
     200           1 :         if (!poMemDS)
     201           0 :             return nullptr;
     202           1 :         OGRSpatialReference *poTmpSRS = nullptr;
     203           1 :         const auto poSrcSRS = poLayer->GetSpatialRef();
     204           1 :         if (poSrcSRS)
     205           0 :             poTmpSRS = poSrcSRS->Clone();
     206             :         auto poMemLayer =
     207           1 :             poMemDS->CreateLayer("footprint", poTmpSRS, wkbPolygon, nullptr);
     208           1 :         if (poTmpSRS)
     209           0 :             poTmpSRS->Release();
     210           1 :         if (!poMemLayer)
     211           0 :             return nullptr;
     212           1 :         poMemLayer->CreateField(
     213           1 :             std::make_unique<OGRFieldDefn>("feature_count", OFTInteger64)
     214           1 :                 .get());
     215             : 
     216             :         const auto metadata =
     217           2 :             poLayer->GetReader()->parquet_reader()->metadata();
     218           1 :         const int numRowGroups = metadata->num_row_groups();
     219          15 :         for (int iRowGroup = 0; iRowGroup < numRowGroups; ++iRowGroup)
     220             :         {
     221          28 :             std::string osMinTmp, osMaxTmp;
     222             :             OGRField unusedF;
     223             :             bool unusedB;
     224             :             OGRFieldSubType unusedSubType;
     225             : 
     226             :             OGRField sXMin;
     227          14 :             OGR_RawField_SetNull(&sXMin);
     228          14 :             bool bFoundXMin = false;
     229          14 :             OGRFieldType eXMinType = OFTMaxType;
     230             : 
     231             :             OGRField sYMin;
     232          14 :             OGR_RawField_SetNull(&sYMin);
     233          14 :             bool bFoundYMin = false;
     234          14 :             OGRFieldType eYMinType = OFTMaxType;
     235             : 
     236             :             OGRField sXMax;
     237          14 :             OGR_RawField_SetNull(&sXMax);
     238          14 :             bool bFoundXMax = false;
     239          14 :             OGRFieldType eXMaxType = OFTMaxType;
     240             : 
     241             :             OGRField sYMax;
     242          14 :             OGR_RawField_SetNull(&sYMax);
     243          14 :             bool bFoundYMax = false;
     244          14 :             OGRFieldType eYMaxType = OFTMaxType;
     245             : 
     246          14 :             if (poLayer->GetMinMaxForParquetCol(
     247             :                     iRowGroup, iParquetXMin, nullptr,
     248             :                     /* bComputeMin = */ true, sXMin, bFoundXMin,
     249             :                     /* bComputeMax = */ false, unusedF, unusedB, eXMinType,
     250           8 :                     unusedSubType, osMinTmp, osMaxTmp) &&
     251           8 :                 bFoundXMin && eXMinType == OFTReal &&
     252          22 :                 poLayer->GetMinMaxForParquetCol(
     253             :                     iRowGroup, iParquetYMin, nullptr,
     254             :                     /* bComputeMin = */ true, sYMin, bFoundYMin,
     255             :                     /* bComputeMax = */ false, unusedF, unusedB, eYMinType,
     256           8 :                     unusedSubType, osMinTmp, osMaxTmp) &&
     257           8 :                 bFoundYMin && eYMinType == OFTReal &&
     258          22 :                 poLayer->GetMinMaxForParquetCol(
     259             :                     iRowGroup, iParquetXMax, nullptr,
     260             :                     /* bComputeMin = */ false, unusedF, unusedB,
     261             :                     /* bComputeMax = */ true, sXMax, bFoundXMax, eXMaxType,
     262           8 :                     unusedSubType, osMaxTmp, osMaxTmp) &&
     263           8 :                 bFoundXMax && eXMaxType == OFTReal &&
     264          22 :                 poLayer->GetMinMaxForParquetCol(
     265             :                     iRowGroup, iParquetYMax, nullptr,
     266             :                     /* bComputeMin = */ false, unusedF, unusedB,
     267             :                     /* bComputeMax = */ true, sYMax, bFoundYMax, eYMaxType,
     268           8 :                     unusedSubType, osMaxTmp, osMaxTmp) &&
     269          22 :                 bFoundYMax && eYMaxType == OFTReal)
     270             :             {
     271          16 :                 OGRFeature oFeat(poMemLayer->GetLayerDefn());
     272           8 :                 oFeat.SetField(0,
     273             :                                static_cast<GIntBig>(
     274           8 :                                    metadata->RowGroup(iRowGroup)->num_rows()));
     275          16 :                 auto poPoly = std::make_unique<OGRPolygon>();
     276           8 :                 auto poLR = std::make_unique<OGRLinearRing>();
     277           8 :                 poLR->addPoint(sXMin.Real, sYMin.Real);
     278           8 :                 poLR->addPoint(sXMin.Real, sYMax.Real);
     279           8 :                 poLR->addPoint(sXMax.Real, sYMax.Real);
     280           8 :                 poLR->addPoint(sXMax.Real, sYMin.Real);
     281           8 :                 poLR->addPoint(sXMin.Real, sYMin.Real);
     282           8 :                 poPoly->addRingDirectly(poLR.release());
     283           8 :                 oFeat.SetGeometryDirectly(poPoly.release());
     284           8 :                 CPL_IGNORE_RET_VAL(poMemLayer->CreateFeature(&oFeat));
     285             :             }
     286             :         }
     287             : 
     288           1 :         return poMemDS.release();
     289             :     }
     290           0 :     return nullptr;
     291             : }
     292             : 
     293             : /************************************************************************/
     294             : /*                                Open()                                */
     295             : /************************************************************************/
     296             : 
     297        1664 : static GDALDataset *OGRParquetDriverOpen(GDALOpenInfo *poOpenInfo)
     298             : {
     299        1664 :     if (poOpenInfo->eAccess == GA_Update)
     300          62 :         return nullptr;
     301             : 
     302             : #ifdef GDAL_USE_ARROWDATASET
     303        3204 :     std::string osBasePath(poOpenInfo->pszFilename);
     304        3204 :     std::string osQueryParameters;
     305             :     const bool bStartedWithParquetPrefix =
     306        1602 :         STARTS_WITH(osBasePath.c_str(), "PARQUET:");
     307             : 
     308        1602 :     if (bStartedWithParquetPrefix)
     309             :     {
     310         261 :         osBasePath = osBasePath.substr(strlen("PARQUET:"));
     311             :     }
     312             : 
     313             :     // Little trick to allow using syntax of
     314             :     // https://github.com/opengeospatial/geoparquet/discussions/101
     315             :     // ogrinfo
     316             :     // "/vsicurl/https://ai4edataeuwest.blob.core.windows.net/us-census/2020/cb_2020_us_vtd_500k.parquet?${SAS_TOKEN}"
     317        1602 :     if (STARTS_WITH(osBasePath.c_str(), "/vsicurl/"))
     318             :     {
     319           2 :         const auto nPos = osBasePath.find(".parquet?st=");
     320           2 :         if (nPos != std::string::npos)
     321             :         {
     322           0 :             osQueryParameters = osBasePath.substr(nPos + strlen(".parquet"));
     323           0 :             osBasePath.resize(nPos + strlen(".parquet"));
     324             :         }
     325             :     }
     326             : 
     327        2465 :     if (bStartedWithParquetPrefix || poOpenInfo->bIsDirectory ||
     328         863 :         !osQueryParameters.empty())
     329             :     {
     330             :         VSIStatBufL sStat;
     331         739 :         if (!osBasePath.empty() && osBasePath.back() == '/')
     332           0 :             osBasePath.pop_back();
     333             :         const std::string osMetadataPath =
     334         739 :             CPLFormFilenameSafe(osBasePath.c_str(), "_metadata", nullptr);
     335         739 :         if (CPLTestBool(
     336        2217 :                 CPLGetConfigOption("OGR_PARQUET_USE_METADATA_FILE", "YES")) &&
     337        1478 :             VSIStatL((osMetadataPath + osQueryParameters).c_str(), &sStat) == 0)
     338             :         {
     339             :             // If there's a _metadata file, then use it to avoid listing files
     340             :             try
     341             :             {
     342          36 :                 return OpenParquetDatasetWithMetadata(
     343             :                     osBasePath, "_metadata", osQueryParameters,
     344          18 :                     poOpenInfo->papszOpenOptions);
     345             :             }
     346           0 :             catch (const std::exception &e)
     347             :             {
     348           0 :                 CPLError(CE_Failure, CPLE_AppDefined, "Parquet exception: %s",
     349           0 :                          e.what());
     350             :             }
     351           0 :             return nullptr;
     352             :         }
     353             :         else
     354             :         {
     355         721 :             bool bLikelyParquetDataset = false;
     356         721 :             if (poOpenInfo->bIsDirectory)
     357             :             {
     358             :                 // Detect if the directory contains .parquet files, or
     359             :                 // subdirectories with a name of the form "key=value", typical
     360             :                 // of HIVE partitioning.
     361         936 :                 const CPLStringList aosFiles(VSIReadDir(osBasePath.c_str()));
     362       22205 :                 for (const char *pszFilename : cpl::Iterate(aosFiles))
     363             :                 {
     364       21739 :                     if (EQUAL(CPLGetExtensionSafe(pszFilename).c_str(),
     365             :                               "parquet"))
     366             :                     {
     367           2 :                         bLikelyParquetDataset = true;
     368           2 :                         break;
     369             :                     }
     370       21737 :                     else if (strchr(pszFilename, '='))
     371             :                     {
     372             :                         // HIVE partitioning
     373           0 :                         if (VSIStatL(CPLFormFilenameSafe(osBasePath.c_str(),
     374             :                                                          pszFilename, nullptr)
     375             :                                          .c_str(),
     376           0 :                                      &sStat) == 0 &&
     377           0 :                             VSI_ISDIR(sStat.st_mode))
     378             :                         {
     379           0 :                             bLikelyParquetDataset = true;
     380           0 :                             break;
     381             :                         }
     382             :                     }
     383             :                 }
     384             :             }
     385             : 
     386         721 :             if (bStartedWithParquetPrefix || bLikelyParquetDataset)
     387             :             {
     388             :                 try
     389             :                 {
     390         510 :                     return OpenParquetDatasetWithoutMetadata(
     391             :                         osBasePath, osQueryParameters,
     392         255 :                         poOpenInfo->papszOpenOptions);
     393             :                 }
     394           0 :                 catch (const std::exception &e)
     395             :                 {
     396             :                     // If we aren't quite sure that the passed file name is
     397             :                     // a directory, then silently continue
     398           0 :                     if (poOpenInfo->bIsDirectory)
     399             :                     {
     400           0 :                         CPLError(CE_Failure, CPLE_AppDefined,
     401           0 :                                  "Parquet exception: %s", e.what());
     402           0 :                         return nullptr;
     403             :                     }
     404             :                 }
     405             :             }
     406             :         }
     407             :     }
     408             : #endif
     409             : 
     410        1329 :     if (!OGRParquetDriverIdentify(poOpenInfo))
     411             :     {
     412           0 :         return nullptr;
     413             :     }
     414             : 
     415        1329 :     if (poOpenInfo->bIsDirectory)
     416         466 :         return nullptr;
     417             : 
     418        1726 :     std::string osFilename(poOpenInfo->pszFilename);
     419         863 :     if (STARTS_WITH(poOpenInfo->pszFilename, "PARQUET:"))
     420             :     {
     421           0 :         osFilename = poOpenInfo->pszFilename + strlen("PARQUET:");
     422             :     }
     423             : 
     424             :     try
     425             :     {
     426         863 :         std::shared_ptr<arrow::io::RandomAccessFile> infile;
     427        1341 :         if (STARTS_WITH(osFilename.c_str(), "/vsi") ||
     428         478 :             CPLTestBool(CPLGetConfigOption("OGR_PARQUET_USE_VSI", "NO")))
     429             :         {
     430         385 :             VSIVirtualHandleUniquePtr fp(poOpenInfo->fpL);
     431         385 :             poOpenInfo->fpL = nullptr;
     432         385 :             if (fp == nullptr)
     433             :             {
     434           0 :                 fp.reset(VSIFOpenL(osFilename.c_str(), "rb"));
     435           0 :                 if (fp == nullptr)
     436           0 :                     return nullptr;
     437             :             }
     438         770 :             infile = std::make_shared<OGRArrowRandomAccessFile>(osFilename,
     439         770 :                                                                 std::move(fp));
     440             :         }
     441             :         else
     442             :         {
     443         478 :             PARQUET_ASSIGN_OR_THROW(infile,
     444             :                                     arrow::io::ReadableFile::Open(osFilename));
     445             :         }
     446             : 
     447             :         // Open Parquet file reader
     448         863 :         std::unique_ptr<parquet::arrow::FileReader> arrow_reader;
     449             :         auto poMemoryPool = std::shared_ptr<arrow::MemoryPool>(
     450        1726 :             arrow::MemoryPool::CreateDefault().release());
     451             : #if ARROW_VERSION_MAJOR >= 19
     452        2589 :         PARQUET_ASSIGN_OR_THROW(
     453             :             arrow_reader,
     454             :             parquet::arrow::OpenFile(std::move(infile), poMemoryPool.get()));
     455             : #else
     456             :         auto st = parquet::arrow::OpenFile(std::move(infile),
     457             :                                            poMemoryPool.get(), &arrow_reader);
     458             :         if (!st.ok())
     459             :         {
     460             :             CPLError(CE_Failure, CPLE_AppDefined,
     461             :                      "parquet::arrow::OpenFile() failed");
     462             :             return nullptr;
     463             :         }
     464             : #endif
     465             : 
     466        1724 :         auto poDS = std::make_unique<OGRParquetDataset>(poMemoryPool);
     467             :         auto poLayer = std::make_unique<OGRParquetLayer>(
     468        1724 :             poDS.get(), CPLGetBasenameSafe(osFilename.c_str()).c_str(),
     469        2586 :             std::move(arrow_reader), poOpenInfo->papszOpenOptions);
     470             : 
     471             :         // For debug purposes: return a layer with the extent of each row group
     472         862 :         if (CPLTestBool(
     473             :                 CPLGetConfigOption("OGR_PARQUET_SHOW_ROW_GROUP_EXTENT", "NO")))
     474             :         {
     475           1 :             return BuildMemDatasetWithRowGroupExtents(poLayer.get());
     476             :         }
     477             : 
     478         861 :         poDS->SetLayer(std::move(poLayer));
     479         861 :         return poDS.release();
     480             :     }
     481           1 :     catch (const std::exception &e)
     482             :     {
     483           1 :         CPLError(CE_Failure, CPLE_AppDefined, "Parquet exception: %s",
     484           1 :                  e.what());
     485           1 :         return nullptr;
     486             :     }
     487             : }
     488             : 
     489             : /************************************************************************/
     490             : /*                               Create()                               */
     491             : /************************************************************************/
     492             : 
     493         269 : static GDALDataset *OGRParquetDriverCreate(const char *pszName, int nXSize,
     494             :                                            int nYSize, int nBands,
     495             :                                            GDALDataType eType,
     496             :                                            char ** /* papszOptions */)
     497             : {
     498         269 :     if (!(nXSize == 0 && nYSize == 0 && nBands == 0 && eType == GDT_Unknown))
     499           0 :         return nullptr;
     500             : 
     501             :     try
     502             :     {
     503         269 :         std::shared_ptr<arrow::io::OutputStream> out_file;
     504         351 :         if (STARTS_WITH(pszName, "/vsi") ||
     505          82 :             CPLTestBool(CPLGetConfigOption("OGR_PARQUET_USE_VSI", "YES")))
     506             :         {
     507         269 :             VSILFILE *fp = VSIFOpenL(pszName, "wb");
     508         269 :             if (fp == nullptr)
     509             :             {
     510           1 :                 CPLError(CE_Failure, CPLE_FileIO, "Cannot create %s", pszName);
     511           1 :                 return nullptr;
     512             :             }
     513         268 :             out_file = std::make_shared<OGRArrowWritableFile>(fp);
     514             :         }
     515             :         else
     516             :         {
     517           0 :             PARQUET_ASSIGN_OR_THROW(out_file,
     518             :                                     arrow::io::FileOutputStream::Open(pszName));
     519             :         }
     520             : 
     521         268 :         return new OGRParquetWriterDataset(out_file);
     522             :     }
     523           0 :     catch (const std::exception &e)
     524             :     {
     525           0 :         CPLError(CE_Failure, CPLE_AppDefined, "Parquet exception: %s",
     526           0 :                  e.what());
     527           0 :         return nullptr;
     528             :     }
     529             : }
     530             : 
     531             : /************************************************************************/
     532             : /*                         OGRParquetDriver()                           */
     533             : /************************************************************************/
     534             : 
     535             : class OGRParquetDriver final : public GDALDriver
     536             : {
     537             :     std::mutex m_oMutex{};
     538             :     bool m_bMetadataInitialized = false;
     539             :     void InitMetadata();
     540             : 
     541             :   public:
     542        1990 :     const char *GetMetadataItem(const char *pszName,
     543             :                                 const char *pszDomain) override
     544             :     {
     545        3980 :         std::lock_guard oLock(m_oMutex);
     546        1990 :         if (EQUAL(pszName, GDAL_DS_LAYER_CREATIONOPTIONLIST))
     547             :         {
     548         301 :             InitMetadata();
     549             :         }
     550        3980 :         return GDALDriver::GetMetadataItem(pszName, pszDomain);
     551             :     }
     552             : 
     553          51 :     char **GetMetadata(const char *pszDomain) override
     554             :     {
     555         102 :         std::lock_guard oLock(m_oMutex);
     556          51 :         InitMetadata();
     557         102 :         return GDALDriver::GetMetadata(pszDomain);
     558             :     }
     559             : };
     560             : 
     561         352 : void OGRParquetDriver::InitMetadata()
     562             : {
     563         352 :     if (m_bMetadataInitialized)
     564         329 :         return;
     565          23 :     m_bMetadataInitialized = true;
     566             : 
     567             :     CPLXMLTreeCloser oTree(
     568          46 :         CPLCreateXMLNode(nullptr, CXT_Element, "LayerCreationOptionList"));
     569             : 
     570          46 :     std::vector<const char *> apszCompressionMethods;
     571          23 :     bool bHasSnappy = false;
     572         161 :     for (const char *pszMethod :
     573         184 :          {"SNAPPY", "GZIP", "BROTLI", "ZSTD", "LZ4_RAW", "LZO", "LZ4_HADOOP"})
     574             :     {
     575             :         auto oResult = arrow::util::Codec::GetCompressionType(
     576         322 :             CPLString(pszMethod).tolower());
     577         161 :         if (oResult.ok() && arrow::util::Codec::IsAvailable(*oResult))
     578             :         {
     579         138 :             if (EQUAL(pszMethod, "SNAPPY"))
     580          23 :                 bHasSnappy = true;
     581         138 :             apszCompressionMethods.emplace_back(pszMethod);
     582             :         }
     583             :     }
     584             : 
     585             :     {
     586          23 :         auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
     587          23 :         CPLAddXMLAttributeAndValue(psOption, "name", "COMPRESSION");
     588          23 :         CPLAddXMLAttributeAndValue(psOption, "type", "string-select");
     589          23 :         CPLAddXMLAttributeAndValue(psOption, "description",
     590             :                                    "Compression method");
     591          23 :         CPLAddXMLAttributeAndValue(psOption, "default",
     592             :                                    bHasSnappy ? "SNAPPY" : "NONE");
     593             :         {
     594          23 :             auto poValueNode = CPLCreateXMLNode(psOption, CXT_Element, "Value");
     595          23 :             CPLAddXMLAttributeAndValue(poValueNode, "alias", "UNCOMPRESSED");
     596          23 :             CPLCreateXMLNode(poValueNode, CXT_Text, "NONE");
     597             :         }
     598         161 :         for (const char *pszMethod : apszCompressionMethods)
     599             :         {
     600         138 :             auto poValueNode = CPLCreateXMLNode(psOption, CXT_Element, "Value");
     601         138 :             CPLCreateXMLNode(poValueNode, CXT_Text, pszMethod);
     602             :         }
     603             :     }
     604             : 
     605             :     {
     606          23 :         auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
     607          23 :         CPLAddXMLAttributeAndValue(psOption, "name", "GEOMETRY_ENCODING");
     608          23 :         CPLAddXMLAttributeAndValue(psOption, "type", "string-select");
     609          23 :         CPLAddXMLAttributeAndValue(psOption, "description",
     610             :                                    "Encoding of geometry columns");
     611          23 :         CPLAddXMLAttributeAndValue(psOption, "default", "WKB");
     612          92 :         for (const char *pszEncoding :
     613         115 :              {"WKB", "WKT", "GEOARROW", "GEOARROW_INTERLEAVED"})
     614             :         {
     615          92 :             auto poValueNode = CPLCreateXMLNode(psOption, CXT_Element, "Value");
     616          92 :             CPLCreateXMLNode(poValueNode, CXT_Text, pszEncoding);
     617          92 :             if (EQUAL(pszEncoding, "GEOARROW"))
     618          23 :                 CPLAddXMLAttributeAndValue(poValueNode, "alias",
     619             :                                            "GEOARROW_STRUCT");
     620             :         }
     621             :     }
     622             : 
     623             :     {
     624          23 :         auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
     625          23 :         CPLAddXMLAttributeAndValue(psOption, "name", "ROW_GROUP_SIZE");
     626          23 :         CPLAddXMLAttributeAndValue(psOption, "type", "integer");
     627          23 :         CPLAddXMLAttributeAndValue(psOption, "description",
     628             :                                    "Maximum number of rows per group");
     629          23 :         CPLAddXMLAttributeAndValue(psOption, "default", "65536");
     630             :     }
     631             : 
     632             :     {
     633          23 :         auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
     634          23 :         CPLAddXMLAttributeAndValue(psOption, "name", "GEOMETRY_NAME");
     635          23 :         CPLAddXMLAttributeAndValue(psOption, "type", "string");
     636          23 :         CPLAddXMLAttributeAndValue(psOption, "description",
     637             :                                    "Name of geometry column");
     638          23 :         CPLAddXMLAttributeAndValue(psOption, "default", "geometry");
     639             :     }
     640             : 
     641             :     {
     642          23 :         auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
     643          23 :         CPLAddXMLAttributeAndValue(psOption, "name", "COORDINATE_PRECISION");
     644          23 :         CPLAddXMLAttributeAndValue(psOption, "type", "float");
     645          23 :         CPLAddXMLAttributeAndValue(psOption, "description",
     646             :                                    "Number of decimals for coordinates (only "
     647             :                                    "for GEOMETRY_ENCODING=WKT)");
     648             :     }
     649             : 
     650             :     {
     651          23 :         auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
     652          23 :         CPLAddXMLAttributeAndValue(psOption, "name", "FID");
     653          23 :         CPLAddXMLAttributeAndValue(psOption, "type", "string");
     654          23 :         CPLAddXMLAttributeAndValue(psOption, "description",
     655             :                                    "Name of the FID column to create");
     656             :     }
     657             : 
     658             :     {
     659          23 :         auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
     660          23 :         CPLAddXMLAttributeAndValue(psOption, "name", "POLYGON_ORIENTATION");
     661          23 :         CPLAddXMLAttributeAndValue(psOption, "type", "string-select");
     662          23 :         CPLAddXMLAttributeAndValue(
     663             :             psOption, "description",
     664             :             "Which ring orientation to use for polygons");
     665          23 :         CPLAddXMLAttributeAndValue(psOption, "default", "COUNTERCLOCKWISE");
     666          23 :         CPLCreateXMLElementAndValue(psOption, "Value", "COUNTERCLOCKWISE");
     667          23 :         CPLCreateXMLElementAndValue(psOption, "Value", "UNMODIFIED");
     668             :     }
     669             : 
     670             :     {
     671          23 :         auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
     672          23 :         CPLAddXMLAttributeAndValue(psOption, "name", "EDGES");
     673          23 :         CPLAddXMLAttributeAndValue(psOption, "type", "string-select");
     674          23 :         CPLAddXMLAttributeAndValue(
     675             :             psOption, "description",
     676             :             "Name of the coordinate system for the edges");
     677          23 :         CPLAddXMLAttributeAndValue(psOption, "default", "PLANAR");
     678          23 :         CPLCreateXMLElementAndValue(psOption, "Value", "PLANAR");
     679          23 :         CPLCreateXMLElementAndValue(psOption, "Value", "SPHERICAL");
     680             :     }
     681             : 
     682             :     {
     683          23 :         auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
     684          23 :         CPLAddXMLAttributeAndValue(psOption, "name", "CREATOR");
     685          23 :         CPLAddXMLAttributeAndValue(psOption, "type", "string");
     686          23 :         CPLAddXMLAttributeAndValue(psOption, "description",
     687             :                                    "Name of creating application");
     688             :     }
     689             : 
     690             :     {
     691          23 :         auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
     692          23 :         CPLAddXMLAttributeAndValue(psOption, "name", "WRITE_COVERING_BBOX");
     693          23 :         CPLAddXMLAttributeAndValue(psOption, "type", "boolean");
     694          23 :         CPLAddXMLAttributeAndValue(psOption, "default", "YES");
     695          23 :         CPLAddXMLAttributeAndValue(psOption, "description",
     696             :                                    "Whether to write xmin/ymin/xmax/ymax "
     697             :                                    "columns with the bounding box of "
     698             :                                    "geometries");
     699             :     }
     700             : 
     701             :     {
     702          23 :         auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
     703          23 :         CPLAddXMLAttributeAndValue(psOption, "name", "SORT_BY_BBOX");
     704          23 :         CPLAddXMLAttributeAndValue(psOption, "type", "boolean");
     705          23 :         CPLAddXMLAttributeAndValue(psOption, "default", "NO");
     706          23 :         CPLAddXMLAttributeAndValue(psOption, "description",
     707             :                                    "Whether features should be sorted based on "
     708             :                                    "the bounding box of their geometries");
     709             :     }
     710             : 
     711          23 :     char *pszXML = CPLSerializeXMLTree(oTree.get());
     712          23 :     GDALDriver::SetMetadataItem(GDAL_DS_LAYER_CREATIONOPTIONLIST, pszXML);
     713          23 :     CPLFree(pszXML);
     714             : }
     715             : 
     716             : /************************************************************************/
     717             : /*                         RegisterOGRParquet()                         */
     718             : /************************************************************************/
     719             : 
     720          36 : void RegisterOGRParquet()
     721             : {
     722          36 :     if (GDALGetDriverByName(DRIVER_NAME) != nullptr)
     723           0 :         return;
     724             : 
     725          72 :     auto poDriver = std::make_unique<OGRParquetDriver>();
     726          36 :     OGRParquetDriverSetCommonMetadata(poDriver.get());
     727             : 
     728          36 :     poDriver->pfnOpen = OGRParquetDriverOpen;
     729          36 :     poDriver->pfnCreate = OGRParquetDriverCreate;
     730             : 
     731          36 :     poDriver->SetMetadataItem("ARROW_VERSION", ARROW_VERSION_STRING);
     732             : #ifdef GDAL_USE_ARROWDATASET
     733          36 :     poDriver->SetMetadataItem("ARROW_DATASET", "YES");
     734             : #endif
     735             : 
     736          36 :     GetGDALDriverManager()->RegisterDriver(poDriver.release());
     737             : 
     738             : #if ARROW_VERSION_MAJOR >= 16
     739             :     // Mostly for tests
     740             :     const char *pszPath =
     741          36 :         CPLGetConfigOption("OGR_PARQUET_LOAD_FILE_SYSTEM_FACTORIES", nullptr);
     742          36 :     if (pszPath)
     743             :     {
     744           0 :         auto result = arrow::fs::LoadFileSystemFactories(pszPath);
     745           0 :         if (!result.ok())
     746             :         {
     747           0 :             CPLError(CE_Warning, CPLE_AppDefined,
     748             :                      "arrow::fs::LoadFileSystemFactories() failed with %s",
     749           0 :                      result.message().c_str());
     750             :         }
     751             :     }
     752             : #endif
     753             : }

Generated by: LCOV version 1.14