LCOV - code coverage report
Current view: top level - ogr/ogrsf_frmts/parquet - ogrparquetdriver.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 289 333 86.8 %
Date: 2024-11-21 22:18:42 Functions: 11 11 100.0 %

          Line data    Source code
       1             : /******************************************************************************
       2             :  *
       3             :  * Project:  Parquet Translator
       4             :  * Purpose:  Implements OGRParquetDriver.
       5             :  * Author:   Even Rouault, <even.rouault at spatialys.com>
       6             :  *
       7             :  ******************************************************************************
       8             :  * Copyright (c) 2022, Planet Labs
       9             :  *
      10             :  * SPDX-License-Identifier: MIT
      11             :  ****************************************************************************/
      12             : 
      13             : #include "gdal_pam.h"
      14             : #include "ogrsf_frmts.h"
      15             : 
      16             : #include <algorithm>
      17             : #include <map>
      18             : #include <tuple>
      19             : 
      20             : #include "ogr_parquet.h"
      21             : #include "ogrparquetdrivercore.h"
      22             : 
      23             : #include "../arrow_common/ograrrowrandomaccessfile.h"
      24             : #include "../arrow_common/vsiarrowfilesystem.hpp"
      25             : #include "../arrow_common/ograrrowwritablefile.h"
      26             : #include "../arrow_common/ograrrowdataset.hpp"
      27             : #include "../arrow_common/ograrrowlayer.hpp"  // for the destructor
      28             : 
      29             : #ifdef GDAL_USE_ARROWDATASET
      30             : 
      31             : /************************************************************************/
      32             : /*                      OpenFromDatasetFactory()                        */
      33             : /************************************************************************/
      34             : 
      35         272 : static GDALDataset *OpenFromDatasetFactory(
      36             :     const std::string &osBasePath,
      37             :     const std::shared_ptr<arrow::dataset::DatasetFactory> &factory,
      38             :     CSLConstList papszOpenOptions,
      39             :     const std::shared_ptr<arrow::fs::FileSystem> &fs)
      40             : {
      41         272 :     std::shared_ptr<arrow::dataset::Dataset> dataset;
      42         544 :     PARQUET_ASSIGN_OR_THROW(dataset, factory->Finish());
      43             : 
      44             :     auto poMemoryPool = std::shared_ptr<arrow::MemoryPool>(
      45         544 :         arrow::MemoryPool::CreateDefault().release());
      46             : 
      47         272 :     const bool bIsVSI = STARTS_WITH(osBasePath.c_str(), "/vsi");
      48         544 :     auto poDS = std::make_unique<OGRParquetDataset>(poMemoryPool);
      49             :     auto poLayer = std::make_unique<OGRParquetDatasetLayer>(
      50         272 :         poDS.get(), CPLGetBasename(osBasePath.c_str()), bIsVSI, dataset,
      51         544 :         papszOpenOptions);
      52         272 :     poDS->SetLayer(std::move(poLayer));
      53         272 :     poDS->SetFileSystem(fs);
      54         544 :     return poDS.release();
      55             : }
      56             : 
      57             : /************************************************************************/
      58             : /*                         GetFileSystem()                              */
      59             : /************************************************************************/
      60             : 
      61             : static std::tuple<std::shared_ptr<arrow::fs::FileSystem>, std::string>
      62         272 : GetFileSystem(std::string &osBasePathInOut,
      63             :               const std::string &osQueryParameters)
      64             : {
      65             :     // Instantiate file system:
      66             :     // - VSIArrowFileSystem implementation for /vsi files
      67             :     // - base implementation for local files (if OGR_PARQUET_USE_VSI set to NO)
      68         272 :     std::shared_ptr<arrow::fs::FileSystem> fs;
      69         272 :     const bool bIsVSI = STARTS_WITH(osBasePathInOut.c_str(), "/vsi");
      70             :     VSIStatBufL sStat;
      71         544 :     std::string osFSFilename;
      72         458 :     if ((bIsVSI ||
      73         536 :          CPLTestBool(CPLGetConfigOption("OGR_PARQUET_USE_VSI", "YES"))) &&
      74         264 :         VSIStatL(osBasePathInOut.c_str(), &sStat) == 0)
      75             :     {
      76         263 :         osFSFilename = osBasePathInOut;
      77         263 :         fs = std::make_shared<VSIArrowFileSystem>("PARQUET", osQueryParameters);
      78             :     }
      79             :     else
      80             :     {
      81             :         // FileSystemFromUriOrPath() doesn't like relative paths
      82             :         // so transform them to absolute.
      83           9 :         std::string osPath(osBasePathInOut);
      84           9 :         if (CPLIsFilenameRelative(osPath.c_str()))
      85             :         {
      86           8 :             char *pszCurDir = CPLGetCurrentDir();
      87           8 :             if (pszCurDir == nullptr)
      88           0 :                 return {nullptr, osFSFilename};
      89           8 :             osPath = CPLFormFilename(pszCurDir, osPath.c_str(), nullptr);
      90           8 :             CPLFree(pszCurDir);
      91             :         }
      92           9 :         PARQUET_ASSIGN_OR_THROW(
      93             :             fs, arrow::fs::FileSystemFromUriOrPath(osPath, &osFSFilename));
      94             :     }
      95         272 :     return {fs, osFSFilename};
      96             : }
      97             : 
      98             : /************************************************************************/
      99             : /*                  OpenParquetDatasetWithMetadata()                    */
     100             : /************************************************************************/
     101             : 
     102          18 : static GDALDataset *OpenParquetDatasetWithMetadata(
     103             :     const std::string &osBasePathIn, const char *pszMetadataFile,
     104             :     const std::string &osQueryParameters, CSLConstList papszOpenOptions)
     105             : {
     106          36 :     std::string osBasePath(osBasePathIn);
     107          18 :     const auto &[fs, osFSFilename] =
     108          36 :         GetFileSystem(osBasePath, osQueryParameters);
     109             : 
     110          36 :     arrow::dataset::ParquetFactoryOptions options;
     111          36 :     auto partitioningFactory = arrow::dataset::HivePartitioning::MakeFactory();
     112             :     options.partitioning =
     113          18 :         arrow::dataset::PartitioningOrFactory(std::move(partitioningFactory));
     114             : 
     115          18 :     std::shared_ptr<arrow::dataset::DatasetFactory> factory;
     116             :     // coverity[copy_constructor_call]
     117          54 :     PARQUET_ASSIGN_OR_THROW(
     118             :         factory, arrow::dataset::ParquetDatasetFactory::Make(
     119             :                      osFSFilename + '/' + pszMetadataFile, fs,
     120             :                      std::make_shared<arrow::dataset::ParquetFileFormat>(),
     121             :                      std::move(options)));
     122             : 
     123          36 :     return OpenFromDatasetFactory(osBasePath, factory, papszOpenOptions, fs);
     124             : }
     125             : 
     126             : /************************************************************************/
     127             : /*                 OpenParquetDatasetWithoutMetadata()                  */
     128             : /************************************************************************/
     129             : 
     130             : static GDALDataset *
     131         254 : OpenParquetDatasetWithoutMetadata(const std::string &osBasePathIn,
     132             :                                   const std::string &osQueryParameters,
     133             :                                   CSLConstList papszOpenOptions)
     134             : {
     135         508 :     std::string osBasePath(osBasePathIn);
     136         254 :     const auto &[fs, osFSFilename] =
     137         508 :         GetFileSystem(osBasePath, osQueryParameters);
     138             : 
     139         508 :     arrow::dataset::FileSystemFactoryOptions options;
     140         254 :     std::shared_ptr<arrow::dataset::DatasetFactory> factory;
     141             : 
     142         508 :     const auto fileInfo = fs->GetFileInfo(osFSFilename);
     143         254 :     if (fileInfo->IsFile())
     144             :     {
     145             :         // coverity[copy_constructor_call]
     146        1008 :         PARQUET_ASSIGN_OR_THROW(
     147             :             factory, arrow::dataset::FileSystemDatasetFactory::Make(
     148             :                          fs, {std::move(osFSFilename)},
     149             :                          std::make_shared<arrow::dataset::ParquetFileFormat>(),
     150             :                          std::move(options)));
     151             :     }
     152             :     else
     153             :     {
     154             :         auto partitioningFactory =
     155           4 :             arrow::dataset::HivePartitioning::MakeFactory();
     156           4 :         options.partitioning = arrow::dataset::PartitioningOrFactory(
     157           4 :             std::move(partitioningFactory));
     158             : 
     159           4 :         arrow::fs::FileSelector selector;
     160           2 :         selector.base_dir = std::move(osFSFilename);
     161           2 :         selector.recursive = true;
     162             : 
     163             :         // coverity[copy_constructor_call]
     164           4 :         PARQUET_ASSIGN_OR_THROW(
     165             :             factory, arrow::dataset::FileSystemDatasetFactory::Make(
     166             :                          fs, std::move(selector),
     167             :                          std::make_shared<arrow::dataset::ParquetFileFormat>(),
     168             :                          std::move(options)));
     169             :     }
     170             : 
     171         508 :     return OpenFromDatasetFactory(osBasePath, factory, papszOpenOptions, fs);
     172             : }
     173             : 
     174             : #endif
     175             : 
     176             : /************************************************************************/
     177             : /*                  BuildMemDatasetWithRowGroupExtents()                */
     178             : /************************************************************************/
     179             : 
     180             : /** Builds a Memory dataset that contains, for each row-group of the input file,
     181             :  * the feature count and spatial extent of the features of this row group,
     182             :  * using Parquet statistics. This assumes that the Parquet file declares
     183             :  * a "covering":{"bbox":{ ... }} metadata item.
     184             :  *
     185             :  * Only for debug purposes.
     186             :  */
     187           1 : static GDALDataset *BuildMemDatasetWithRowGroupExtents(OGRParquetLayer *poLayer)
     188             : {
     189           1 :     int iParquetXMin = -1;
     190           1 :     int iParquetYMin = -1;
     191           1 :     int iParquetXMax = -1;
     192           1 :     int iParquetYMax = -1;
     193           1 :     if (poLayer->GeomColsBBOXParquet(0, iParquetXMin, iParquetYMin,
     194             :                                      iParquetXMax, iParquetYMax))
     195             :     {
     196           1 :         auto poMemDrv = GetGDALDriverManager()->GetDriverByName("Memory");
     197           1 :         if (!poMemDrv)
     198           0 :             return nullptr;
     199             :         auto poMemDS = std::unique_ptr<GDALDataset>(
     200           2 :             poMemDrv->Create("", 0, 0, 0, GDT_Unknown, nullptr));
     201           1 :         if (!poMemDS)
     202           0 :             return nullptr;
     203           1 :         OGRSpatialReference *poTmpSRS = nullptr;
     204           1 :         const auto poSrcSRS = poLayer->GetSpatialRef();
     205           1 :         if (poSrcSRS)
     206           0 :             poTmpSRS = poSrcSRS->Clone();
     207             :         auto poMemLayer =
     208           1 :             poMemDS->CreateLayer("footprint", poTmpSRS, wkbPolygon, nullptr);
     209           1 :         if (poTmpSRS)
     210           0 :             poTmpSRS->Release();
     211           1 :         if (!poMemLayer)
     212           0 :             return nullptr;
     213           1 :         poMemLayer->CreateField(
     214           1 :             std::make_unique<OGRFieldDefn>("feature_count", OFTInteger64)
     215           1 :                 .get());
     216             : 
     217             :         const auto metadata =
     218           2 :             poLayer->GetReader()->parquet_reader()->metadata();
     219           1 :         const int numRowGroups = metadata->num_row_groups();
     220          15 :         for (int iRowGroup = 0; iRowGroup < numRowGroups; ++iRowGroup)
     221             :         {
     222          28 :             std::string osMinTmp, osMaxTmp;
     223             :             OGRField unusedF;
     224             :             bool unusedB;
     225             :             OGRFieldSubType unusedSubType;
     226             : 
     227             :             OGRField sXMin;
     228          14 :             OGR_RawField_SetNull(&sXMin);
     229          14 :             bool bFoundXMin = false;
     230          14 :             OGRFieldType eXMinType = OFTMaxType;
     231             : 
     232             :             OGRField sYMin;
     233          14 :             OGR_RawField_SetNull(&sYMin);
     234          14 :             bool bFoundYMin = false;
     235          14 :             OGRFieldType eYMinType = OFTMaxType;
     236             : 
     237             :             OGRField sXMax;
     238          14 :             OGR_RawField_SetNull(&sXMax);
     239          14 :             bool bFoundXMax = false;
     240          14 :             OGRFieldType eXMaxType = OFTMaxType;
     241             : 
     242             :             OGRField sYMax;
     243          14 :             OGR_RawField_SetNull(&sYMax);
     244          14 :             bool bFoundYMax = false;
     245          14 :             OGRFieldType eYMaxType = OFTMaxType;
     246             : 
     247          14 :             if (poLayer->GetMinMaxForParquetCol(
     248             :                     iRowGroup, iParquetXMin, nullptr,
     249             :                     /* bComputeMin = */ true, sXMin, bFoundXMin,
     250             :                     /* bComputeMax = */ false, unusedF, unusedB, eXMinType,
     251           8 :                     unusedSubType, osMinTmp, osMaxTmp) &&
     252           8 :                 bFoundXMin && eXMinType == OFTReal &&
     253          22 :                 poLayer->GetMinMaxForParquetCol(
     254             :                     iRowGroup, iParquetYMin, nullptr,
     255             :                     /* bComputeMin = */ true, sYMin, bFoundYMin,
     256             :                     /* bComputeMax = */ false, unusedF, unusedB, eYMinType,
     257           8 :                     unusedSubType, osMinTmp, osMaxTmp) &&
     258           8 :                 bFoundYMin && eYMinType == OFTReal &&
     259          22 :                 poLayer->GetMinMaxForParquetCol(
     260             :                     iRowGroup, iParquetXMax, nullptr,
     261             :                     /* bComputeMin = */ false, unusedF, unusedB,
     262             :                     /* bComputeMax = */ true, sXMax, bFoundXMax, eXMaxType,
     263           8 :                     unusedSubType, osMaxTmp, osMaxTmp) &&
     264           8 :                 bFoundXMax && eXMaxType == OFTReal &&
     265          22 :                 poLayer->GetMinMaxForParquetCol(
     266             :                     iRowGroup, iParquetYMax, nullptr,
     267             :                     /* bComputeMin = */ false, unusedF, unusedB,
     268             :                     /* bComputeMax = */ true, sYMax, bFoundYMax, eYMaxType,
     269           8 :                     unusedSubType, osMaxTmp, osMaxTmp) &&
     270          22 :                 bFoundYMax && eYMaxType == OFTReal)
     271             :             {
     272          16 :                 OGRFeature oFeat(poMemLayer->GetLayerDefn());
     273           8 :                 oFeat.SetField(0,
     274             :                                static_cast<GIntBig>(
     275           8 :                                    metadata->RowGroup(iRowGroup)->num_rows()));
     276          16 :                 auto poPoly = std::make_unique<OGRPolygon>();
     277           8 :                 auto poLR = std::make_unique<OGRLinearRing>();
     278           8 :                 poLR->addPoint(sXMin.Real, sYMin.Real);
     279           8 :                 poLR->addPoint(sXMin.Real, sYMax.Real);
     280           8 :                 poLR->addPoint(sXMax.Real, sYMax.Real);
     281           8 :                 poLR->addPoint(sXMax.Real, sYMin.Real);
     282           8 :                 poLR->addPoint(sXMin.Real, sYMin.Real);
     283           8 :                 poPoly->addRingDirectly(poLR.release());
     284           8 :                 oFeat.SetGeometryDirectly(poPoly.release());
     285           8 :                 CPL_IGNORE_RET_VAL(poMemLayer->CreateFeature(&oFeat));
     286             :             }
     287             :         }
     288             : 
     289           1 :         return poMemDS.release();
     290             :     }
     291           0 :     return nullptr;
     292             : }
     293             : 
     294             : /************************************************************************/
     295             : /*                                Open()                                */
     296             : /************************************************************************/
     297             : 
     298        1600 : static GDALDataset *OGRParquetDriverOpen(GDALOpenInfo *poOpenInfo)
     299             : {
     300        1600 :     if (poOpenInfo->eAccess == GA_Update)
     301          61 :         return nullptr;
     302             : 
     303             : #ifdef GDAL_USE_ARROWDATASET
     304        3078 :     std::string osBasePath(poOpenInfo->pszFilename);
     305        3078 :     std::string osQueryParameters;
     306             :     const bool bStartedWithParquetPrefix =
     307        1539 :         STARTS_WITH(osBasePath.c_str(), "PARQUET:");
     308             : 
     309        1539 :     if (bStartedWithParquetPrefix)
     310             :     {
     311         260 :         osBasePath = osBasePath.substr(strlen("PARQUET:"));
     312             :     }
     313             : 
     314             :     // Little trick to allow using syntax of
     315             :     // https://github.com/opengeospatial/geoparquet/discussions/101
     316             :     // ogrinfo
     317             :     // "/vsicurl/https://ai4edataeuwest.blob.core.windows.net/us-census/2020/cb_2020_us_vtd_500k.parquet?${SAS_TOKEN}"
     318        1539 :     if (STARTS_WITH(osBasePath.c_str(), "/vsicurl/"))
     319             :     {
     320           1 :         const auto nPos = osBasePath.find(".parquet?st=");
     321           1 :         if (nPos != std::string::npos)
     322             :         {
     323           0 :             osQueryParameters = osBasePath.substr(nPos + strlen(".parquet"));
     324           0 :             osBasePath.resize(nPos + strlen(".parquet"));
     325             :         }
     326             :     }
     327             : 
     328        2386 :     if (bStartedWithParquetPrefix || poOpenInfo->bIsDirectory ||
     329         847 :         !osQueryParameters.empty())
     330             :     {
     331             :         VSIStatBufL sStat;
     332         692 :         if (!osBasePath.empty() && osBasePath.back() == '/')
     333           0 :             osBasePath.pop_back();
     334             :         std::string osMetadataPath =
     335         692 :             CPLFormFilename(osBasePath.c_str(), "_metadata", nullptr);
     336         692 :         if (CPLTestBool(
     337        2076 :                 CPLGetConfigOption("OGR_PARQUET_USE_METADATA_FILE", "YES")) &&
     338        1384 :             VSIStatL((osMetadataPath + osQueryParameters).c_str(), &sStat) == 0)
     339             :         {
     340             :             // If there's a _metadata file, then use it to avoid listing files
     341             :             try
     342             :             {
     343          36 :                 return OpenParquetDatasetWithMetadata(
     344             :                     osBasePath, "_metadata", osQueryParameters,
     345          18 :                     poOpenInfo->papszOpenOptions);
     346             :             }
     347           0 :             catch (const std::exception &e)
     348             :             {
     349           0 :                 CPLError(CE_Failure, CPLE_AppDefined, "Parquet exception: %s",
     350           0 :                          e.what());
     351             :             }
     352           0 :             return nullptr;
     353             :         }
     354             :         else
     355             :         {
     356         674 :             bool bLikelyParquetDataset = false;
     357         674 :             if (poOpenInfo->bIsDirectory)
     358             :             {
     359             :                 // Detect if the directory contains .parquet files, or
     360             :                 // subdirectories with a name of the form "key=value", typical
     361             :                 // of HIVE partitioning.
     362         844 :                 const CPLStringList aosFiles(VSIReadDir(osBasePath.c_str()));
     363       22244 :                 for (const char *pszFilename : cpl::Iterate(aosFiles))
     364             :                 {
     365       21824 :                     if (EQUAL(CPLGetExtension(pszFilename), "parquet"))
     366             :                     {
     367           2 :                         bLikelyParquetDataset = true;
     368           2 :                         break;
     369             :                     }
     370       21822 :                     else if (strchr(pszFilename, '='))
     371             :                     {
     372             :                         // HIVE partitioning
     373           0 :                         if (VSIStatL(CPLFormFilename(osBasePath.c_str(),
     374             :                                                      pszFilename, nullptr),
     375           0 :                                      &sStat) == 0 &&
     376           0 :                             VSI_ISDIR(sStat.st_mode))
     377             :                         {
     378           0 :                             bLikelyParquetDataset = true;
     379           0 :                             break;
     380             :                         }
     381             :                     }
     382             :                 }
     383             :             }
     384             : 
     385         674 :             if (bStartedWithParquetPrefix || bLikelyParquetDataset)
     386             :             {
     387             :                 try
     388             :                 {
     389         508 :                     return OpenParquetDatasetWithoutMetadata(
     390             :                         osBasePath, osQueryParameters,
     391         254 :                         poOpenInfo->papszOpenOptions);
     392             :                 }
     393           0 :                 catch (const std::exception &e)
     394             :                 {
     395             :                     // If we aren't quite sure that the passed file name is
     396             :                     // a directory, then silently continue
     397           0 :                     if (poOpenInfo->bIsDirectory)
     398             :                     {
     399           0 :                         CPLError(CE_Failure, CPLE_AppDefined,
     400           0 :                                  "Parquet exception: %s", e.what());
     401           0 :                         return nullptr;
     402             :                     }
     403             :                 }
     404             :             }
     405             :         }
     406             :     }
     407             : #endif
     408             : 
     409        1267 :     if (!OGRParquetDriverIdentify(poOpenInfo))
     410             :     {
     411           0 :         return nullptr;
     412             :     }
     413             : 
     414        1267 :     if (poOpenInfo->bIsDirectory)
     415         420 :         return nullptr;
     416             : 
     417        1694 :     std::string osFilename(poOpenInfo->pszFilename);
     418         847 :     if (STARTS_WITH(poOpenInfo->pszFilename, "PARQUET:"))
     419             :     {
     420           0 :         osFilename = poOpenInfo->pszFilename + strlen("PARQUET:");
     421             :     }
     422             : 
     423             :     try
     424             :     {
     425         847 :         std::shared_ptr<arrow::io::RandomAccessFile> infile;
     426        1310 :         if (STARTS_WITH(osFilename.c_str(), "/vsi") ||
     427         463 :             CPLTestBool(CPLGetConfigOption("OGR_PARQUET_USE_VSI", "NO")))
     428             :         {
     429         384 :             VSIVirtualHandleUniquePtr fp(poOpenInfo->fpL);
     430         384 :             poOpenInfo->fpL = nullptr;
     431         384 :             if (fp == nullptr)
     432             :             {
     433           0 :                 fp.reset(VSIFOpenL(osFilename.c_str(), "rb"));
     434           0 :                 if (fp == nullptr)
     435           0 :                     return nullptr;
     436             :             }
     437         768 :             infile = std::make_shared<OGRArrowRandomAccessFile>(osFilename,
     438         768 :                                                                 std::move(fp));
     439             :         }
     440             :         else
     441             :         {
     442         463 :             PARQUET_ASSIGN_OR_THROW(infile,
     443             :                                     arrow::io::ReadableFile::Open(osFilename));
     444             :         }
     445             : 
     446             :         // Open Parquet file reader
     447         847 :         std::unique_ptr<parquet::arrow::FileReader> arrow_reader;
     448             :         auto poMemoryPool = std::shared_ptr<arrow::MemoryPool>(
     449        1694 :             arrow::MemoryPool::CreateDefault().release());
     450         847 :         auto st = parquet::arrow::OpenFile(std::move(infile),
     451        2541 :                                            poMemoryPool.get(), &arrow_reader);
     452         847 :         if (!st.ok())
     453             :         {
     454           1 :             CPLError(CE_Failure, CPLE_AppDefined,
     455             :                      "parquet::arrow::OpenFile() failed");
     456           1 :             return nullptr;
     457             :         }
     458             : 
     459        1692 :         auto poDS = std::make_unique<OGRParquetDataset>(poMemoryPool);
     460             :         auto poLayer = std::make_unique<OGRParquetLayer>(
     461         846 :             poDS.get(), CPLGetBasename(osFilename.c_str()),
     462        2538 :             std::move(arrow_reader), poOpenInfo->papszOpenOptions);
     463             : 
     464             :         // For debug purposes: return a layer with the extent of each row group
     465         846 :         if (CPLTestBool(
     466             :                 CPLGetConfigOption("OGR_PARQUET_SHOW_ROW_GROUP_EXTENT", "NO")))
     467             :         {
     468           1 :             return BuildMemDatasetWithRowGroupExtents(poLayer.get());
     469             :         }
     470             : 
     471         845 :         poDS->SetLayer(std::move(poLayer));
     472         845 :         return poDS.release();
     473             :     }
     474           0 :     catch (const std::exception &e)
     475             :     {
     476           0 :         CPLError(CE_Failure, CPLE_AppDefined, "Parquet exception: %s",
     477           0 :                  e.what());
     478           0 :         return nullptr;
     479             :     }
     480             : }
     481             : 
     482             : /************************************************************************/
     483             : /*                               Create()                               */
     484             : /************************************************************************/
     485             : 
     486         264 : static GDALDataset *OGRParquetDriverCreate(const char *pszName, int nXSize,
     487             :                                            int nYSize, int nBands,
     488             :                                            GDALDataType eType,
     489             :                                            char ** /* papszOptions */)
     490             : {
     491         264 :     if (!(nXSize == 0 && nYSize == 0 && nBands == 0 && eType == GDT_Unknown))
     492           0 :         return nullptr;
     493             : 
     494             :     try
     495             :     {
     496         264 :         std::shared_ptr<arrow::io::OutputStream> out_file;
     497         342 :         if (STARTS_WITH(pszName, "/vsi") ||
     498          78 :             CPLTestBool(CPLGetConfigOption("OGR_PARQUET_USE_VSI", "YES")))
     499             :         {
     500         264 :             VSILFILE *fp = VSIFOpenL(pszName, "wb");
     501         264 :             if (fp == nullptr)
     502             :             {
     503           1 :                 CPLError(CE_Failure, CPLE_FileIO, "Cannot create %s", pszName);
     504           1 :                 return nullptr;
     505             :             }
     506         263 :             out_file = std::make_shared<OGRArrowWritableFile>(fp);
     507             :         }
     508             :         else
     509             :         {
     510           0 :             PARQUET_ASSIGN_OR_THROW(out_file,
     511             :                                     arrow::io::FileOutputStream::Open(pszName));
     512             :         }
     513             : 
     514         263 :         return new OGRParquetWriterDataset(out_file);
     515             :     }
     516           0 :     catch (const std::exception &e)
     517             :     {
     518           0 :         CPLError(CE_Failure, CPLE_AppDefined, "Parquet exception: %s",
     519           0 :                  e.what());
     520           0 :         return nullptr;
     521             :     }
     522             : }
     523             : 
     524             : /************************************************************************/
     525             : /*                         OGRParquetDriver()                           */
     526             : /************************************************************************/
     527             : 
     528             : class OGRParquetDriver final : public GDALDriver
     529             : {
     530             :     bool m_bMetadataInitialized = false;
     531             :     void InitMetadata();
     532             : 
     533             :   public:
     534        1745 :     const char *GetMetadataItem(const char *pszName,
     535             :                                 const char *pszDomain) override
     536             :     {
     537        1745 :         if (EQUAL(pszName, GDAL_DS_LAYER_CREATIONOPTIONLIST))
     538             :         {
     539         296 :             InitMetadata();
     540             :         }
     541        1745 :         return GDALDriver::GetMetadataItem(pszName, pszDomain);
     542             :     }
     543             : 
     544          40 :     char **GetMetadata(const char *pszDomain) override
     545             :     {
     546          40 :         InitMetadata();
     547          40 :         return GDALDriver::GetMetadata(pszDomain);
     548             :     }
     549             : };
     550             : 
     551         336 : void OGRParquetDriver::InitMetadata()
     552             : {
     553         336 :     if (m_bMetadataInitialized)
     554         321 :         return;
     555          15 :     m_bMetadataInitialized = true;
     556             : 
     557             :     CPLXMLTreeCloser oTree(
     558          30 :         CPLCreateXMLNode(nullptr, CXT_Element, "LayerCreationOptionList"));
     559             : 
     560          30 :     std::vector<const char *> apszCompressionMethods;
     561          15 :     bool bHasSnappy = false;
     562         105 :     for (const char *pszMethod :
     563         120 :          {"SNAPPY", "GZIP", "BROTLI", "ZSTD", "LZ4_RAW", "LZO", "LZ4_HADOOP"})
     564             :     {
     565             :         auto oResult = arrow::util::Codec::GetCompressionType(
     566         210 :             CPLString(pszMethod).tolower());
     567         105 :         if (oResult.ok() && arrow::util::Codec::IsAvailable(*oResult))
     568             :         {
     569          90 :             if (EQUAL(pszMethod, "SNAPPY"))
     570          15 :                 bHasSnappy = true;
     571          90 :             apszCompressionMethods.emplace_back(pszMethod);
     572             :         }
     573             :     }
     574             : 
     575             :     {
     576          15 :         auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
     577          15 :         CPLAddXMLAttributeAndValue(psOption, "name", "COMPRESSION");
     578          15 :         CPLAddXMLAttributeAndValue(psOption, "type", "string-select");
     579          15 :         CPLAddXMLAttributeAndValue(psOption, "description",
     580             :                                    "Compression method");
     581          15 :         CPLAddXMLAttributeAndValue(psOption, "default",
     582             :                                    bHasSnappy ? "SNAPPY" : "NONE");
     583             :         {
     584          15 :             auto poValueNode = CPLCreateXMLNode(psOption, CXT_Element, "Value");
     585          15 :             CPLAddXMLAttributeAndValue(poValueNode, "alias", "UNCOMPRESSED");
     586          15 :             CPLCreateXMLNode(poValueNode, CXT_Text, "NONE");
     587             :         }
     588         105 :         for (const char *pszMethod : apszCompressionMethods)
     589             :         {
     590          90 :             auto poValueNode = CPLCreateXMLNode(psOption, CXT_Element, "Value");
     591          90 :             CPLCreateXMLNode(poValueNode, CXT_Text, pszMethod);
     592             :         }
     593             :     }
     594             : 
     595             :     {
     596          15 :         auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
     597          15 :         CPLAddXMLAttributeAndValue(psOption, "name", "GEOMETRY_ENCODING");
     598          15 :         CPLAddXMLAttributeAndValue(psOption, "type", "string-select");
     599          15 :         CPLAddXMLAttributeAndValue(psOption, "description",
     600             :                                    "Encoding of geometry columns");
     601          15 :         CPLAddXMLAttributeAndValue(psOption, "default", "WKB");
     602          60 :         for (const char *pszEncoding :
     603          75 :              {"WKB", "WKT", "GEOARROW", "GEOARROW_INTERLEAVED"})
     604             :         {
     605          60 :             auto poValueNode = CPLCreateXMLNode(psOption, CXT_Element, "Value");
     606          60 :             CPLCreateXMLNode(poValueNode, CXT_Text, pszEncoding);
     607          60 :             if (EQUAL(pszEncoding, "GEOARROW"))
     608          15 :                 CPLAddXMLAttributeAndValue(poValueNode, "alias",
     609             :                                            "GEOARROW_STRUCT");
     610             :         }
     611             :     }
     612             : 
     613             :     {
     614          15 :         auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
     615          15 :         CPLAddXMLAttributeAndValue(psOption, "name", "ROW_GROUP_SIZE");
     616          15 :         CPLAddXMLAttributeAndValue(psOption, "type", "integer");
     617          15 :         CPLAddXMLAttributeAndValue(psOption, "description",
     618             :                                    "Maximum number of rows per group");
     619          15 :         CPLAddXMLAttributeAndValue(psOption, "default", "65536");
     620             :     }
     621             : 
     622             :     {
     623          15 :         auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
     624          15 :         CPLAddXMLAttributeAndValue(psOption, "name", "GEOMETRY_NAME");
     625          15 :         CPLAddXMLAttributeAndValue(psOption, "type", "string");
     626          15 :         CPLAddXMLAttributeAndValue(psOption, "description",
     627             :                                    "Name of geometry column");
     628          15 :         CPLAddXMLAttributeAndValue(psOption, "default", "geometry");
     629             :     }
     630             : 
     631             :     {
     632          15 :         auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
     633          15 :         CPLAddXMLAttributeAndValue(psOption, "name", "COORDINATE_PRECISION");
     634          15 :         CPLAddXMLAttributeAndValue(psOption, "type", "float");
     635          15 :         CPLAddXMLAttributeAndValue(psOption, "description",
     636             :                                    "Number of decimals for coordinates (only "
     637             :                                    "for GEOMETRY_ENCODING=WKT)");
     638             :     }
     639             : 
     640             :     {
     641          15 :         auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
     642          15 :         CPLAddXMLAttributeAndValue(psOption, "name", "FID");
     643          15 :         CPLAddXMLAttributeAndValue(psOption, "type", "string");
     644          15 :         CPLAddXMLAttributeAndValue(psOption, "description",
     645             :                                    "Name of the FID column to create");
     646             :     }
     647             : 
     648             :     {
     649          15 :         auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
     650          15 :         CPLAddXMLAttributeAndValue(psOption, "name", "POLYGON_ORIENTATION");
     651          15 :         CPLAddXMLAttributeAndValue(psOption, "type", "string-select");
     652          15 :         CPLAddXMLAttributeAndValue(
     653             :             psOption, "description",
     654             :             "Which ring orientation to use for polygons");
     655          15 :         CPLAddXMLAttributeAndValue(psOption, "default", "COUNTERCLOCKWISE");
     656          15 :         CPLCreateXMLElementAndValue(psOption, "Value", "COUNTERCLOCKWISE");
     657          15 :         CPLCreateXMLElementAndValue(psOption, "Value", "UNMODIFIED");
     658             :     }
     659             : 
     660             :     {
     661          15 :         auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
     662          15 :         CPLAddXMLAttributeAndValue(psOption, "name", "EDGES");
     663          15 :         CPLAddXMLAttributeAndValue(psOption, "type", "string-select");
     664          15 :         CPLAddXMLAttributeAndValue(
     665             :             psOption, "description",
     666             :             "Name of the coordinate system for the edges");
     667          15 :         CPLAddXMLAttributeAndValue(psOption, "default", "PLANAR");
     668          15 :         CPLCreateXMLElementAndValue(psOption, "Value", "PLANAR");
     669          15 :         CPLCreateXMLElementAndValue(psOption, "Value", "SPHERICAL");
     670             :     }
     671             : 
     672             :     {
     673          15 :         auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
     674          15 :         CPLAddXMLAttributeAndValue(psOption, "name", "CREATOR");
     675          15 :         CPLAddXMLAttributeAndValue(psOption, "type", "string");
     676          15 :         CPLAddXMLAttributeAndValue(psOption, "description",
     677             :                                    "Name of creating application");
     678             :     }
     679             : 
     680             :     {
     681          15 :         auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
     682          15 :         CPLAddXMLAttributeAndValue(psOption, "name", "WRITE_COVERING_BBOX");
     683          15 :         CPLAddXMLAttributeAndValue(psOption, "type", "boolean");
     684          15 :         CPLAddXMLAttributeAndValue(psOption, "default", "YES");
     685          15 :         CPLAddXMLAttributeAndValue(psOption, "description",
     686             :                                    "Whether to write xmin/ymin/xmax/ymax "
     687             :                                    "columns with the bounding box of "
     688             :                                    "geometries");
     689             :     }
     690             : 
     691             :     {
     692          15 :         auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option");
     693          15 :         CPLAddXMLAttributeAndValue(psOption, "name", "SORT_BY_BBOX");
     694          15 :         CPLAddXMLAttributeAndValue(psOption, "type", "boolean");
     695          15 :         CPLAddXMLAttributeAndValue(psOption, "default", "NO");
     696          15 :         CPLAddXMLAttributeAndValue(psOption, "description",
     697             :                                    "Whether features should be sorted based on "
     698             :                                    "the bounding box of their geometries");
     699             :     }
     700             : 
     701          15 :     char *pszXML = CPLSerializeXMLTree(oTree.get());
     702          15 :     GDALDriver::SetMetadataItem(GDAL_DS_LAYER_CREATIONOPTIONLIST, pszXML);
     703          15 :     CPLFree(pszXML);
     704             : }
     705             : 
     706             : /************************************************************************/
     707             : /*                         RegisterOGRParquet()                         */
     708             : /************************************************************************/
     709             : 
     710          30 : void RegisterOGRParquet()
     711             : {
     712          30 :     if (GDALGetDriverByName(DRIVER_NAME) != nullptr)
     713           0 :         return;
     714             : 
     715          60 :     auto poDriver = std::make_unique<OGRParquetDriver>();
     716          30 :     OGRParquetDriverSetCommonMetadata(poDriver.get());
     717             : 
     718          30 :     poDriver->pfnOpen = OGRParquetDriverOpen;
     719          30 :     poDriver->pfnCreate = OGRParquetDriverCreate;
     720             : 
     721          30 :     poDriver->SetMetadataItem("ARROW_VERSION", ARROW_VERSION_STRING);
     722             : #ifdef GDAL_USE_ARROWDATASET
     723          30 :     poDriver->SetMetadataItem("ARROW_DATASET", "YES");
     724             : #endif
     725             : 
     726          30 :     GetGDALDriverManager()->RegisterDriver(poDriver.release());
     727             : 
     728             : #if ARROW_VERSION_MAJOR >= 16
     729             :     // Mostly for tests
     730             :     const char *pszPath =
     731          30 :         CPLGetConfigOption("OGR_PARQUET_LOAD_FILE_SYSTEM_FACTORIES", nullptr);
     732          30 :     if (pszPath)
     733             :     {
     734           0 :         auto result = arrow::fs::LoadFileSystemFactories(pszPath);
     735           0 :         if (!result.ok())
     736             :         {
     737           0 :             CPLError(CE_Warning, CPLE_AppDefined,
     738             :                      "arrow::fs::LoadFileSystemFactories() failed with %s",
     739           0 :                      result.message().c_str());
     740             :         }
     741             :     }
     742             : #endif
     743             : }

Generated by: LCOV version 1.14