LCOV - code coverage report
Current view: top level - ogr/ogrsf_frmts/parquet - ogrparquetwriterlayer.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 519 602 86.2 %
Date: 2025-01-18 12:42:00 Functions: 28 28 100.0 %

          Line data    Source code
       1             : /******************************************************************************
       2             :  *
       3             :  * Project:  Parquet Translator
       4             :  * Purpose:  Implements OGRParquetDriver.
       5             :  * Author:   Even Rouault, <even.rouault at spatialys.com>
       6             :  *
       7             :  ******************************************************************************
       8             :  * Copyright (c) 2022, Planet Labs
       9             :  *
      10             :  * SPDX-License-Identifier: MIT
      11             :  ****************************************************************************/
      12             : 
      13             : #ifdef STANDALONE
      14             : #include "gdal_version.h"
      15             : #else
      16             : #undef DO_NOT_DEFINE_GDAL_DATE_NAME
      17             : #include "gdal_version_full/gdal_version.h"
      18             : #endif
      19             : 
      20             : #include "ogr_parquet.h"
      21             : 
      22             : #include "../arrow_common/ograrrowwriterlayer.hpp"
      23             : 
      24             : #include "ogr_wkb.h"
      25             : 
      26             : #include <utility>
      27             : 
      28             : /************************************************************************/
      29             : /*                      OGRParquetWriterLayer()                         */
      30             : /************************************************************************/
      31             : 
      32         265 : OGRParquetWriterLayer::OGRParquetWriterLayer(
      33             :     OGRParquetWriterDataset *poDataset, arrow::MemoryPool *poMemoryPool,
      34             :     const std::shared_ptr<arrow::io::OutputStream> &poOutputStream,
      35         265 :     const char *pszLayerName)
      36             :     : OGRArrowWriterLayer(poMemoryPool, poOutputStream, pszLayerName),
      37         265 :       m_poDataset(poDataset)
      38             : {
      39         265 :     m_bWriteFieldArrowExtensionName = CPLTestBool(
      40             :         CPLGetConfigOption("OGR_PARQUET_WRITE_ARROW_EXTENSION_NAME", "NO"));
      41         265 : }
      42             : 
      43             : /************************************************************************/
      44             : /*                                Close()                               */
      45             : /************************************************************************/
      46             : 
      47         262 : bool OGRParquetWriterLayer::Close()
      48             : {
      49         262 :     if (m_poTmpGPKGLayer)
      50             :     {
      51           2 :         if (!CopyTmpGpkgLayerToFinalFile())
      52           0 :             return false;
      53             :     }
      54             : 
      55         262 :     if (m_bInitializationOK)
      56             :     {
      57         262 :         if (!FinalizeWriting())
      58           0 :             return false;
      59             :     }
      60             : 
      61         262 :     return true;
      62             : }
      63             : 
      64             : /************************************************************************/
      65             : /*                     CopyTmpGpkgLayerToFinalFile()                    */
      66             : /************************************************************************/
      67             : 
      68           2 : bool OGRParquetWriterLayer::CopyTmpGpkgLayerToFinalFile()
      69             : {
      70           2 :     if (!m_poTmpGPKGLayer)
      71             :     {
      72           0 :         return true;
      73             :     }
      74             : 
      75           2 :     CPLDebug("PARQUET", "CopyTmpGpkgLayerToFinalFile(): start...");
      76             : 
      77           2 :     VSIUnlink(m_poTmpGPKG->GetDescription());
      78             : 
      79           4 :     OGRFeature oFeat(m_poFeatureDefn);
      80             : 
      81             :     // Interval in terms of features between 2 debug progress report messages
      82           2 :     constexpr int PROGRESS_FC_INTERVAL = 100 * 1000;
      83             : 
      84             :     // First, write features without geometries
      85             :     {
      86           2 :         auto poTmpLayer = std::unique_ptr<OGRLayer>(m_poTmpGPKG->ExecuteSQL(
      87             :             "SELECT serialized_feature FROM tmp WHERE fid NOT IN (SELECT id "
      88             :             "FROM rtree_tmp_geom)",
      89           2 :             nullptr, nullptr));
      90           2 :         if (!poTmpLayer)
      91           0 :             return false;
      92        1004 :         for (const auto &poSrcFeature : poTmpLayer.get())
      93             :         {
      94        1002 :             int nBytesFeature = 0;
      95             :             const GByte *pabyFeatureData =
      96        1002 :                 poSrcFeature->GetFieldAsBinary(0, &nBytesFeature);
      97        1002 :             if (!oFeat.DeserializeFromBinary(pabyFeatureData, nBytesFeature))
      98             :             {
      99           0 :                 CPLError(CE_Failure, CPLE_AppDefined,
     100             :                          "Cannot deserialize feature");
     101           0 :                 return false;
     102             :             }
     103        1002 :             if (OGRArrowWriterLayer::ICreateFeature(&oFeat) != OGRERR_NONE)
     104             :             {
     105           0 :                 return false;
     106             :             }
     107             : 
     108        1002 :             if ((m_nFeatureCount % PROGRESS_FC_INTERVAL) == 0)
     109             :             {
     110           0 :                 CPLDebugProgress(
     111             :                     "PARQUET",
     112             :                     "CopyTmpGpkgLayerToFinalFile(): %.02f%% progress",
     113           0 :                     100.0 * double(m_nFeatureCount) /
     114           0 :                         double(m_nTmpFeatureCount));
     115             :             }
     116             :         }
     117             : 
     118           2 :         if (!FlushFeatures())
     119             :         {
     120           0 :             return false;
     121             :         }
     122             :     }
     123             : 
     124             :     // Now walk through the GPKG RTree for features with geometries
     125             :     // Cf https://github.com/sqlite/sqlite/blob/master/ext/rtree/rtree.c
     126             :     // for the description of the content of the rtree _node table
     127           4 :     std::vector<std::pair<int64_t, int>> aNodeNoDepthPair;
     128           2 :     int nTreeDepth = 0;
     129             :     // Queue the root node
     130             :     aNodeNoDepthPair.emplace_back(
     131           2 :         std::make_pair(/* nodeNo = */ 1, /* depth = */ 0));
     132           2 :     int nCountWrittenFeaturesSinceLastFlush = 0;
     133          50 :     while (!aNodeNoDepthPair.empty())
     134             :     {
     135          48 :         const auto &oLastPair = aNodeNoDepthPair.back();
     136          48 :         const int64_t nNodeNo = oLastPair.first;
     137          48 :         const int nCurDepth = oLastPair.second;
     138             :         //CPLDebug("PARQUET", "Reading nodeNode=%d, curDepth=%d", int(nNodeNo), nCurDepth);
     139          48 :         aNodeNoDepthPair.pop_back();
     140             : 
     141          48 :         auto poRTreeLayer = std::unique_ptr<OGRLayer>(m_poTmpGPKG->ExecuteSQL(
     142             :             CPLSPrintf("SELECT data FROM rtree_tmp_geom_node WHERE nodeno "
     143             :                        "= " CPL_FRMT_GIB,
     144             :                        static_cast<GIntBig>(nNodeNo)),
     145          48 :             nullptr, nullptr));
     146          48 :         if (!poRTreeLayer)
     147             :         {
     148           0 :             CPLError(CE_Failure, CPLE_AppDefined,
     149             :                      "Cannot read node " CPL_FRMT_GIB,
     150             :                      static_cast<GIntBig>(nNodeNo));
     151           0 :             return false;
     152             :         }
     153             :         const auto poRTreeFeature =
     154          48 :             std::unique_ptr<const OGRFeature>(poRTreeLayer->GetNextFeature());
     155          48 :         if (!poRTreeFeature)
     156             :         {
     157           0 :             CPLError(CE_Failure, CPLE_AppDefined,
     158             :                      "Cannot read node " CPL_FRMT_GIB,
     159             :                      static_cast<GIntBig>(nNodeNo));
     160           0 :             return false;
     161             :         }
     162             : 
     163          48 :         int nNodeBytes = 0;
     164             :         const GByte *pabyNodeData =
     165          48 :             poRTreeFeature->GetFieldAsBinary(0, &nNodeBytes);
     166          48 :         constexpr int BLOB_HEADER_SIZE = 4;
     167          48 :         if (nNodeBytes < BLOB_HEADER_SIZE)
     168             :         {
     169           0 :             CPLError(CE_Failure, CPLE_AppDefined,
     170             :                      "Not enough bytes when reading node " CPL_FRMT_GIB,
     171             :                      static_cast<GIntBig>(nNodeNo));
     172           0 :             return false;
     173             :         }
     174          48 :         if (nNodeNo == 1)
     175             :         {
     176             :             // Get the RTree depth from the root node
     177           2 :             nTreeDepth = (pabyNodeData[0] << 8) | pabyNodeData[1];
     178             :             //CPLDebug("PARQUET", "nTreeDepth = %d", nTreeDepth);
     179             :         }
     180             : 
     181          48 :         const int nCellCount = (pabyNodeData[2] << 8) | pabyNodeData[3];
     182          48 :         constexpr int SIZEOF_CELL = 24;  // int64_t + 4 float
     183          48 :         if (nNodeBytes < BLOB_HEADER_SIZE + SIZEOF_CELL * nCellCount)
     184             :         {
     185           0 :             CPLError(CE_Failure, CPLE_AppDefined,
     186             :                      "Not enough bytes when reading node " CPL_FRMT_GIB,
     187             :                      static_cast<GIntBig>(nNodeNo));
     188           0 :             return false;
     189             :         }
     190             : 
     191          48 :         size_t nOffset = BLOB_HEADER_SIZE;
     192          48 :         if (nCurDepth == nTreeDepth)
     193             :         {
     194             :             // Leaf node: it references feature IDs.
     195             : 
     196             :             // If we are about to go above m_nRowGroupSize, flush past
     197             :             // features now, to improve the spatial compacity of the row group.
     198          46 :             if (m_nRowGroupSize > nCellCount &&
     199          46 :                 nCountWrittenFeaturesSinceLastFlush + nCellCount >
     200          46 :                     m_nRowGroupSize)
     201             :             {
     202          14 :                 nCountWrittenFeaturesSinceLastFlush = 0;
     203          14 :                 if (!FlushFeatures())
     204             :                 {
     205           0 :                     return false;
     206             :                 }
     207             :             }
     208             : 
     209             :             // nCellCount shouldn't be over 51 normally, but even 65535
     210             :             // would be fine...
     211             :             // coverity[tainted_data]
     212        1248 :             for (int i = 0; i < nCellCount; ++i)
     213             :             {
     214             :                 int64_t nFID;
     215        1202 :                 memcpy(&nFID, pabyNodeData + nOffset, sizeof(int64_t));
     216        1202 :                 CPL_MSBPTR64(&nFID);
     217             : 
     218             :                 const auto poSrcFeature = std::unique_ptr<const OGRFeature>(
     219        1202 :                     m_poTmpGPKGLayer->GetFeature(nFID));
     220        1202 :                 if (!poSrcFeature)
     221             :                 {
     222           0 :                     CPLError(CE_Failure, CPLE_AppDefined,
     223             :                              "Cannot get feature " CPL_FRMT_GIB,
     224             :                              static_cast<GIntBig>(nFID));
     225           0 :                     return false;
     226             :                 }
     227             : 
     228        1202 :                 int nBytesFeature = 0;
     229             :                 const GByte *pabyFeatureData =
     230        1202 :                     poSrcFeature->GetFieldAsBinary(0, &nBytesFeature);
     231        1202 :                 if (!oFeat.DeserializeFromBinary(pabyFeatureData,
     232             :                                                  nBytesFeature))
     233             :                 {
     234           0 :                     CPLError(CE_Failure, CPLE_AppDefined,
     235             :                              "Cannot deserialize feature");
     236           0 :                     return false;
     237             :                 }
     238        1202 :                 if (OGRArrowWriterLayer::ICreateFeature(&oFeat) != OGRERR_NONE)
     239             :                 {
     240           0 :                     return false;
     241             :                 }
     242             : 
     243        1202 :                 nOffset += SIZEOF_CELL;
     244             : 
     245        1202 :                 ++nCountWrittenFeaturesSinceLastFlush;
     246             : 
     247        1202 :                 if ((m_nFeatureCount % PROGRESS_FC_INTERVAL) == 0 ||
     248        1202 :                     m_nFeatureCount == m_nTmpFeatureCount / 2)
     249             :                 {
     250           2 :                     CPLDebugProgress(
     251             :                         "PARQUET",
     252             :                         "CopyTmpGpkgLayerToFinalFile(): %.02f%% progress",
     253           2 :                         100.0 * double(m_nFeatureCount) /
     254           2 :                             double(m_nTmpFeatureCount));
     255             :                 }
     256             :             }
     257             :         }
     258             :         else
     259             :         {
     260             :             // Non-leaf node: it references child nodes.
     261             : 
     262             :             // nCellCount shouldn't be over 51 normally, but even 65535
     263             :             // would be fine...
     264             :             // coverity[tainted_data]
     265          48 :             for (int i = 0; i < nCellCount; ++i)
     266             :             {
     267             :                 int64_t nNode;
     268          46 :                 memcpy(&nNode, pabyNodeData + nOffset, sizeof(int64_t));
     269          46 :                 CPL_MSBPTR64(&nNode);
     270             :                 aNodeNoDepthPair.emplace_back(
     271          46 :                     std::make_pair(nNode, nCurDepth + 1));
     272          46 :                 nOffset += SIZEOF_CELL;
     273             :             }
     274             :         }
     275             :     }
     276             : 
     277           2 :     CPLDebug("PARQUET",
     278             :              "CopyTmpGpkgLayerToFinalFile(): 100%%, successfully finished");
     279           2 :     return true;
     280             : }
     281             : 
     282             : /************************************************************************/
     283             : /*                       IsSupportedGeometryType()                      */
     284             : /************************************************************************/
     285             : 
     286         269 : bool OGRParquetWriterLayer::IsSupportedGeometryType(
     287             :     OGRwkbGeometryType eGType) const
     288             : {
     289         269 :     const auto eFlattenType = wkbFlatten(eGType);
     290         269 :     if (!OGR_GT_HasM(eGType) && eFlattenType <= wkbGeometryCollection)
     291             :     {
     292         268 :         return true;
     293             :     }
     294             : 
     295             :     const auto osConfigOptionName =
     296           3 :         "OGR_" + GetDriverUCName() + "_ALLOW_ALL_DIMS";
     297           1 :     if (CPLTestBool(CPLGetConfigOption(osConfigOptionName.c_str(), "NO")))
     298             :     {
     299           0 :         return true;
     300             :     }
     301             : 
     302           1 :     CPLError(CE_Failure, CPLE_NotSupported,
     303             :              "Only 2D and Z geometry types are supported (unless the "
     304             :              "%s configuration option is set to YES)",
     305             :              osConfigOptionName.c_str());
     306           1 :     return false;
     307             : }
     308             : 
     309             : /************************************************************************/
     310             : /*                           SetOptions()                               */
     311             : /************************************************************************/
     312             : 
     313         265 : bool OGRParquetWriterLayer::SetOptions(CSLConstList papszOptions,
     314             :                                        const OGRSpatialReference *poSpatialRef,
     315             :                                        OGRwkbGeometryType eGType)
     316             : {
     317         265 :     m_bWriteBBoxStruct = CPLTestBool(CSLFetchNameValueDef(
     318             :         papszOptions, "WRITE_COVERING_BBOX",
     319             :         CPLGetConfigOption("OGR_PARQUET_WRITE_COVERING_BBOX", "YES")));
     320             : 
     321         265 :     if (CPLTestBool(CSLFetchNameValueDef(papszOptions, "SORT_BY_BBOX", "NO")))
     322             :     {
     323           6 :         const std::string osTmpGPKG(std::string(m_poDataset->GetDescription()) +
     324           3 :                                     ".tmp.gpkg");
     325           3 :         auto poGPKGDrv = GetGDALDriverManager()->GetDriverByName("GPKG");
     326           3 :         if (!poGPKGDrv)
     327             :         {
     328           1 :             CPLError(
     329             :                 CE_Failure, CPLE_AppDefined,
     330             :                 "Driver GPKG required for SORT_BY_BBOX layer creation option");
     331           1 :             return false;
     332             :         }
     333           2 :         m_poTmpGPKG.reset(poGPKGDrv->Create(osTmpGPKG.c_str(), 0, 0, 0,
     334             :                                             GDT_Unknown, nullptr));
     335           2 :         if (!m_poTmpGPKG)
     336           0 :             return false;
     337           2 :         m_poTmpGPKG->MarkSuppressOnClose();
     338           2 :         m_poTmpGPKGLayer = m_poTmpGPKG->CreateLayer("tmp");
     339           2 :         if (!m_poTmpGPKGLayer)
     340           0 :             return false;
     341             :         // Serialized feature
     342           2 :         m_poTmpGPKGLayer->CreateField(
     343           2 :             std::make_unique<OGRFieldDefn>("serialized_feature", OFTBinary)
     344           2 :                 .get());
     345           2 :         CPL_IGNORE_RET_VAL(m_poTmpGPKGLayer->StartTransaction());
     346             :     }
     347             : 
     348             :     const char *pszGeomEncoding =
     349         264 :         CSLFetchNameValue(papszOptions, "GEOMETRY_ENCODING");
     350         264 :     m_eGeomEncoding = OGRArrowGeomEncoding::WKB;
     351         264 :     if (pszGeomEncoding)
     352             :     {
     353         148 :         if (EQUAL(pszGeomEncoding, "WKB"))
     354           0 :             m_eGeomEncoding = OGRArrowGeomEncoding::WKB;
     355         148 :         else if (EQUAL(pszGeomEncoding, "WKT"))
     356           8 :             m_eGeomEncoding = OGRArrowGeomEncoding::WKT;
     357         140 :         else if (EQUAL(pszGeomEncoding, "GEOARROW_INTERLEAVED"))
     358             :         {
     359          28 :             CPLErrorOnce(
     360             :                 CE_Warning, CPLE_AppDefined,
     361             :                 "Use of GEOMETRY_ENCODING=GEOARROW_INTERLEAVED is not "
     362             :                 "recommended. "
     363             :                 "GeoParquet 1.1 uses GEOMETRY_ENCODING=GEOARROW (struct) "
     364             :                 "instead.");
     365          28 :             m_eGeomEncoding = OGRArrowGeomEncoding::GEOARROW_FSL_GENERIC;
     366             :         }
     367         112 :         else if (EQUAL(pszGeomEncoding, "GEOARROW") ||
     368           0 :                  EQUAL(pszGeomEncoding, "GEOARROW_STRUCT"))
     369         112 :             m_eGeomEncoding = OGRArrowGeomEncoding::GEOARROW_STRUCT_GENERIC;
     370             :         else
     371             :         {
     372           0 :             CPLError(CE_Failure, CPLE_NotSupported,
     373             :                      "Unsupported GEOMETRY_ENCODING = %s", pszGeomEncoding);
     374           0 :             return false;
     375             :         }
     376             :     }
     377             : 
     378             :     const char *pszCoordPrecision =
     379         264 :         CSLFetchNameValue(papszOptions, "COORDINATE_PRECISION");
     380         264 :     if (pszCoordPrecision)
     381           0 :         m_nWKTCoordinatePrecision = atoi(pszCoordPrecision);
     382             : 
     383         264 :     m_bForceCounterClockwiseOrientation =
     384         264 :         EQUAL(CSLFetchNameValueDef(papszOptions, "POLYGON_ORIENTATION",
     385             :                                    "COUNTERCLOCKWISE"),
     386             :               "COUNTERCLOCKWISE");
     387             : 
     388         264 :     if (eGType != wkbNone)
     389             :     {
     390         243 :         if (!IsSupportedGeometryType(eGType))
     391             :         {
     392           1 :             return false;
     393             :         }
     394             : 
     395         242 :         m_poFeatureDefn->SetGeomType(eGType);
     396         242 :         auto eGeomEncoding = m_eGeomEncoding;
     397         242 :         if (eGeomEncoding == OGRArrowGeomEncoding::GEOARROW_FSL_GENERIC ||
     398         214 :             eGeomEncoding == OGRArrowGeomEncoding::GEOARROW_STRUCT_GENERIC)
     399             :         {
     400         140 :             const auto eEncodingType = eGeomEncoding;
     401         140 :             eGeomEncoding = GetPreciseArrowGeomEncoding(eEncodingType, eGType);
     402         140 :             if (eGeomEncoding == eEncodingType)
     403           0 :                 return false;
     404             :         }
     405         242 :         m_aeGeomEncoding.push_back(eGeomEncoding);
     406         242 :         m_poFeatureDefn->GetGeomFieldDefn(0)->SetName(
     407             :             CSLFetchNameValueDef(papszOptions, "GEOMETRY_NAME", "geometry"));
     408         242 :         if (poSpatialRef)
     409             :         {
     410          25 :             auto poSRS = poSpatialRef->Clone();
     411          25 :             m_poFeatureDefn->GetGeomFieldDefn(0)->SetSpatialRef(poSRS);
     412          25 :             poSRS->Release();
     413             :         }
     414             :     }
     415             : 
     416         263 :     m_osFIDColumn = CSLFetchNameValueDef(papszOptions, "FID", "");
     417             : 
     418         263 :     const char *pszCompression = CSLFetchNameValue(papszOptions, "COMPRESSION");
     419         263 :     if (pszCompression == nullptr)
     420             :     {
     421         777 :         auto oResult = arrow::util::Codec::GetCompressionType("snappy");
     422         259 :         if (oResult.ok() && arrow::util::Codec::IsAvailable(*oResult))
     423             :         {
     424         259 :             pszCompression = "SNAPPY";
     425             :         }
     426             :         else
     427             :         {
     428           0 :             pszCompression = "NONE";
     429             :         }
     430             :     }
     431             : 
     432         263 :     if (EQUAL(pszCompression, "NONE"))
     433           0 :         pszCompression = "UNCOMPRESSED";
     434             :     auto oResult = arrow::util::Codec::GetCompressionType(
     435         526 :         CPLString(pszCompression).tolower());
     436         263 :     if (!oResult.ok())
     437             :     {
     438           1 :         CPLError(CE_Failure, CPLE_NotSupported,
     439             :                  "Unrecognized compression method: %s", pszCompression);
     440           1 :         return false;
     441             :     }
     442         262 :     m_eCompression = *oResult;
     443         262 :     if (!arrow::util::Codec::IsAvailable(m_eCompression))
     444             :     {
     445           0 :         CPLError(CE_Failure, CPLE_NotSupported,
     446             :                  "Compression method %s is known, but libarrow has not "
     447             :                  "been built with support for it",
     448             :                  pszCompression);
     449           0 :         return false;
     450             :     }
     451             : 
     452         262 :     m_oWriterPropertiesBuilder.compression(m_eCompression);
     453             :     const std::string osCreator =
     454         262 :         CSLFetchNameValueDef(papszOptions, "CREATOR", "");
     455         262 :     if (!osCreator.empty())
     456           1 :         m_oWriterPropertiesBuilder.created_by(osCreator);
     457             :     else
     458         261 :         m_oWriterPropertiesBuilder.created_by("GDAL " GDAL_RELEASE_NAME
     459             :                                               ", using " CREATED_BY_VERSION);
     460             : 
     461             :     // Undocumented option. Not clear it is useful besides unit test purposes
     462         262 :     if (!CPLTestBool(CSLFetchNameValueDef(papszOptions, "STATISTICS", "YES")))
     463           1 :         m_oWriterPropertiesBuilder.disable_statistics();
     464             : 
     465             : #if PARQUET_VERSION_MAJOR >= 12
     466             :     // Undocumented option. Not clear it is useful to disable it.
     467         262 :     if (CPLTestBool(CSLFetchNameValueDef(papszOptions, "PAGE_INDEX", "YES")))
     468         262 :         m_oWriterPropertiesBuilder.enable_write_page_index();
     469             : #endif
     470             : 
     471         262 :     if (m_eGeomEncoding == OGRArrowGeomEncoding::WKB && eGType != wkbNone)
     472             :     {
     473         101 :         m_oWriterPropertiesBuilder.disable_statistics(
     474         303 :             parquet::schema::ColumnPath::FromDotString(
     475         101 :                 m_poFeatureDefn->GetGeomFieldDefn(0)->GetNameRef()));
     476             :     }
     477             : 
     478             :     const char *pszRowGroupSize =
     479         262 :         CSLFetchNameValue(papszOptions, "ROW_GROUP_SIZE");
     480         262 :     if (pszRowGroupSize)
     481             :     {
     482           5 :         auto nRowGroupSize = static_cast<int64_t>(atoll(pszRowGroupSize));
     483           5 :         if (nRowGroupSize > 0)
     484             :         {
     485           5 :             if (nRowGroupSize > INT_MAX)
     486           0 :                 nRowGroupSize = INT_MAX;
     487           5 :             m_nRowGroupSize = nRowGroupSize;
     488             :         }
     489             :     }
     490             : 
     491         262 :     m_bEdgesSpherical = EQUAL(
     492             :         CSLFetchNameValueDef(papszOptions, "EDGES", "PLANAR"), "SPHERICAL");
     493             : 
     494         262 :     m_bInitializationOK = true;
     495         262 :     return true;
     496             : }
     497             : 
     498             : /************************************************************************/
     499             : /*                         CloseFileWriter()                            */
     500             : /************************************************************************/
     501             : 
     502         262 : bool OGRParquetWriterLayer::CloseFileWriter()
     503             : {
     504         524 :     auto status = m_poFileWriter->Close();
     505         262 :     if (!status.ok())
     506             :     {
     507           0 :         CPLError(CE_Failure, CPLE_AppDefined,
     508             :                  "FileWriter::Close() failed with %s",
     509           0 :                  status.message().c_str());
     510             :     }
     511         524 :     return status.ok();
     512             : }
     513             : 
     514             : /************************************************************************/
     515             : /*                            IdentifyCRS()                             */
     516             : /************************************************************************/
     517             : 
     518          24 : static OGRSpatialReference IdentifyCRS(const OGRSpatialReference *poSRS)
     519             : {
     520          24 :     OGRSpatialReference oSRSIdentified(*poSRS);
     521             : 
     522          24 :     if (poSRS->GetAuthorityName(nullptr) == nullptr)
     523             :     {
     524             :         // Try to find a registered CRS that matches the input one
     525           4 :         int nEntries = 0;
     526           4 :         int *panConfidence = nullptr;
     527             :         OGRSpatialReferenceH *pahSRS =
     528           4 :             poSRS->FindMatches(nullptr, &nEntries, &panConfidence);
     529             : 
     530             :         // If there are several matches >= 90%, take the only one
     531             :         // that is EPSG
     532           4 :         int iOtherAuthority = -1;
     533           4 :         int iEPSG = -1;
     534           4 :         const char *const apszOptions[] = {
     535             :             "IGNORE_DATA_AXIS_TO_SRS_AXIS_MAPPING=YES", nullptr};
     536           4 :         int iConfidenceBestMatch = -1;
     537           6 :         for (int iSRS = 0; iSRS < nEntries; iSRS++)
     538             :         {
     539           4 :             auto poCandidateCRS = OGRSpatialReference::FromHandle(pahSRS[iSRS]);
     540           4 :             if (panConfidence[iSRS] < iConfidenceBestMatch ||
     541           4 :                 panConfidence[iSRS] < 70)
     542             :             {
     543             :                 break;
     544             :             }
     545           3 :             if (poSRS->IsSame(poCandidateCRS, apszOptions))
     546             :             {
     547             :                 const char *pszAuthName =
     548           3 :                     poCandidateCRS->GetAuthorityName(nullptr);
     549           3 :                 if (pszAuthName != nullptr && EQUAL(pszAuthName, "EPSG"))
     550             :                 {
     551           2 :                     iOtherAuthority = -2;
     552           2 :                     if (iEPSG < 0)
     553             :                     {
     554           2 :                         iConfidenceBestMatch = panConfidence[iSRS];
     555           2 :                         iEPSG = iSRS;
     556             :                     }
     557             :                     else
     558             :                     {
     559           0 :                         iEPSG = -1;
     560           0 :                         break;
     561             :                     }
     562             :                 }
     563           1 :                 else if (iEPSG < 0 && pszAuthName != nullptr)
     564             :                 {
     565           1 :                     if (EQUAL(pszAuthName, "OGC"))
     566             :                     {
     567             :                         const char *pszAuthCode =
     568           1 :                             poCandidateCRS->GetAuthorityCode(nullptr);
     569           1 :                         if (pszAuthCode && EQUAL(pszAuthCode, "CRS84"))
     570             :                         {
     571           1 :                             iOtherAuthority = iSRS;
     572           1 :                             break;
     573             :                         }
     574             :                     }
     575           0 :                     else if (iOtherAuthority == -1)
     576             :                     {
     577           0 :                         iConfidenceBestMatch = panConfidence[iSRS];
     578           0 :                         iOtherAuthority = iSRS;
     579             :                     }
     580             :                     else
     581           0 :                         iOtherAuthority = -2;
     582             :                 }
     583             :             }
     584             :         }
     585           4 :         if (iEPSG >= 0)
     586             :         {
     587           2 :             oSRSIdentified = *OGRSpatialReference::FromHandle(pahSRS[iEPSG]);
     588             :         }
     589           2 :         else if (iOtherAuthority >= 0)
     590             :         {
     591             :             oSRSIdentified =
     592           1 :                 *OGRSpatialReference::FromHandle(pahSRS[iOtherAuthority]);
     593             :         }
     594           4 :         OSRFreeSRSArray(pahSRS);
     595           4 :         CPLFree(panConfidence);
     596             :     }
     597             : 
     598          24 :     return oSRSIdentified;
     599             : }
     600             : 
     601             : /************************************************************************/
     602             : /*                      RemoveIDFromMemberOfEnsembles()                 */
     603             : /************************************************************************/
     604             : 
     605         314 : static void RemoveIDFromMemberOfEnsembles(CPLJSONObject &obj)
     606             : {
     607             :     // Remove "id" from members of datum ensembles for compatibility with
     608             :     // older PROJ versions
     609             :     // Cf https://github.com/opengeospatial/geoparquet/discussions/110
     610             :     // and https://github.com/OSGeo/PROJ/pull/3221
     611         314 :     if (obj.GetType() == CPLJSONObject::Type::Object)
     612             :     {
     613         398 :         for (auto &subObj : obj.GetChildren())
     614             :         {
     615         304 :             RemoveIDFromMemberOfEnsembles(subObj);
     616             :         }
     617             :     }
     618         242 :     else if (obj.GetType() == CPLJSONObject::Type::Array &&
     619         242 :              obj.GetName() == "members")
     620             :     {
     621           0 :         for (auto &subObj : obj.ToArray())
     622             :         {
     623           0 :             subObj.Delete("id");
     624             :         }
     625             :     }
     626         314 : }
     627             : 
     628             : /************************************************************************/
     629             : /*                            GetGeoMetadata()                          */
     630             : /************************************************************************/
     631             : 
     632         262 : std::string OGRParquetWriterLayer::GetGeoMetadata() const
     633             : {
     634             :     // Just for unit testing purposes
     635             :     const char *pszGeoMetadata =
     636         262 :         CPLGetConfigOption("OGR_PARQUET_GEO_METADATA", nullptr);
     637         262 :     if (pszGeoMetadata)
     638          16 :         return pszGeoMetadata;
     639             : 
     640         481 :     if (m_poFeatureDefn->GetGeomFieldCount() != 0 &&
     641         235 :         CPLTestBool(CPLGetConfigOption("OGR_PARQUET_WRITE_GEO", "YES")))
     642             :     {
     643         468 :         CPLJSONObject oRoot;
     644         234 :         oRoot.Add("version", "1.1.0");
     645         234 :         oRoot.Add("primary_column",
     646         234 :                   m_poFeatureDefn->GetGeomFieldDefn(0)->GetNameRef());
     647         468 :         CPLJSONObject oColumns;
     648         234 :         oRoot.Add("columns", oColumns);
     649         485 :         for (int i = 0; i < m_poFeatureDefn->GetGeomFieldCount(); ++i)
     650             :         {
     651         251 :             const auto poGeomFieldDefn = m_poFeatureDefn->GetGeomFieldDefn(i);
     652         502 :             CPLJSONObject oColumn;
     653         251 :             oColumns.Add(poGeomFieldDefn->GetNameRef(), oColumn);
     654         251 :             oColumn.Add("encoding",
     655         251 :                         GetGeomEncodingAsString(m_aeGeomEncoding[i], true));
     656             : 
     657         251 :             if (CPLTestBool(CPLGetConfigOption("OGR_PARQUET_WRITE_CRS", "YES")))
     658             :             {
     659         250 :                 const auto poSRS = poGeomFieldDefn->GetSpatialRef();
     660         250 :                 if (poSRS)
     661             :                 {
     662          48 :                     OGRSpatialReference oSRSIdentified(IdentifyCRS(poSRS));
     663             : 
     664             :                     const char *pszAuthName =
     665          24 :                         oSRSIdentified.GetAuthorityName(nullptr);
     666             :                     const char *pszAuthCode =
     667          24 :                         oSRSIdentified.GetAuthorityCode(nullptr);
     668             : 
     669          24 :                     bool bOmitCRS = false;
     670          24 :                     if (pszAuthName != nullptr && pszAuthCode != nullptr &&
     671          23 :                         ((EQUAL(pszAuthName, "EPSG") &&
     672          20 :                           EQUAL(pszAuthCode, "4326")) ||
     673          12 :                          (EQUAL(pszAuthName, "OGC") &&
     674           3 :                           EQUAL(pszAuthCode, "CRS84"))))
     675             :                     {
     676             :                         // To make things less confusing for non-geo-aware
     677             :                         // consumers, omit EPSG:4326 / OGC:CRS84 CRS by default
     678          14 :                         bOmitCRS = CPLTestBool(CPLGetConfigOption(
     679             :                             "OGR_PARQUET_CRS_OMIT_IF_WGS84", "YES"));
     680             :                     }
     681             : 
     682          24 :                     if (bOmitCRS)
     683             :                     {
     684             :                         // do nothing
     685             :                     }
     686          10 :                     else if (EQUAL(CPLGetConfigOption(
     687             :                                        "OGR_PARQUET_CRS_ENCODING", "PROJJSON"),
     688             :                                    "PROJJSON"))
     689             :                     {
     690             :                         // CRS encoded as PROJJSON for GeoParquet >= 0.4.0
     691          10 :                         char *pszPROJJSON = nullptr;
     692          10 :                         oSRSIdentified.exportToPROJJSON(&pszPROJJSON, nullptr);
     693          20 :                         CPLJSONDocument oCRSDoc;
     694          10 :                         CPL_IGNORE_RET_VAL(oCRSDoc.LoadMemory(pszPROJJSON));
     695          10 :                         CPLFree(pszPROJJSON);
     696          10 :                         CPLJSONObject oCRSRoot = oCRSDoc.GetRoot();
     697          10 :                         RemoveIDFromMemberOfEnsembles(oCRSRoot);
     698          10 :                         oColumn.Add("crs", oCRSRoot);
     699             :                     }
     700             :                     else
     701             :                     {
     702             :                         // WKT was used in GeoParquet <= 0.3.0
     703           0 :                         const char *const apszOptions[] = {
     704             :                             "FORMAT=WKT2_2019", "MULTILINE=NO", nullptr};
     705           0 :                         char *pszWKT = nullptr;
     706           0 :                         oSRSIdentified.exportToWkt(&pszWKT, apszOptions);
     707           0 :                         if (pszWKT)
     708           0 :                             oColumn.Add("crs", pszWKT);
     709           0 :                         CPLFree(pszWKT);
     710             :                     }
     711             : 
     712          24 :                     const double dfCoordEpoch = poSRS->GetCoordinateEpoch();
     713          24 :                     if (dfCoordEpoch > 0)
     714           2 :                         oColumn.Add("epoch", dfCoordEpoch);
     715             :                 }
     716             :                 else
     717             :                 {
     718         226 :                     oColumn.AddNull("crs");
     719             :                 }
     720             :             }
     721             : 
     722         251 :             if (m_bEdgesSpherical)
     723             :             {
     724           1 :                 oColumn.Add("edges", "spherical");
     725             :             }
     726             : 
     727         475 :             if (m_aoEnvelopes[i].IsInit() &&
     728         224 :                 CPLTestBool(
     729             :                     CPLGetConfigOption("OGR_PARQUET_WRITE_BBOX", "YES")))
     730             :             {
     731         224 :                 bool bHasZ = false;
     732         407 :                 for (const auto eGeomType : m_oSetWrittenGeometryTypes[i])
     733             :                 {
     734         266 :                     bHasZ = OGR_GT_HasZ(eGeomType);
     735         266 :                     if (bHasZ)
     736          83 :                         break;
     737             :                 }
     738         224 :                 CPLJSONArray oBBOX;
     739         224 :                 oBBOX.Add(m_aoEnvelopes[i].MinX);
     740         224 :                 oBBOX.Add(m_aoEnvelopes[i].MinY);
     741         224 :                 if (bHasZ)
     742          83 :                     oBBOX.Add(m_aoEnvelopes[i].MinZ);
     743         224 :                 oBBOX.Add(m_aoEnvelopes[i].MaxX);
     744         224 :                 oBBOX.Add(m_aoEnvelopes[i].MaxY);
     745         224 :                 if (bHasZ)
     746          83 :                     oBBOX.Add(m_aoEnvelopes[i].MaxZ);
     747         224 :                 oColumn.Add("bbox", oBBOX);
     748             :             }
     749             : 
     750             :             // Bounding box column definition
     751         432 :             if (m_bWriteBBoxStruct &&
     752         181 :                 CPLTestBool(CPLGetConfigOption(
     753             :                     "OGR_PARQUET_WRITE_COVERING_BBOX_IN_METADATA", "YES")))
     754             :             {
     755         362 :                 CPLJSONObject oCovering;
     756         181 :                 oColumn.Add("covering", oCovering);
     757         362 :                 CPLJSONObject oBBOX;
     758         181 :                 oCovering.Add("bbox", oBBOX);
     759             :                 const auto AddComponent =
     760        2172 :                     [this, i, &oBBOX](const char *pszComponent)
     761             :                 {
     762         724 :                     CPLJSONArray oArray;
     763         724 :                     oArray.Add(m_apoFieldsBBOX[i]->name());
     764         724 :                     oArray.Add(pszComponent);
     765         724 :                     oBBOX.Add(pszComponent, oArray);
     766         724 :                 };
     767         181 :                 AddComponent("xmin");
     768         181 :                 AddComponent("ymin");
     769         181 :                 AddComponent("xmax");
     770         181 :                 AddComponent("ymax");
     771             :             }
     772             : 
     773         282 :             const auto GetStringGeometryType = [](OGRwkbGeometryType eType)
     774             :             {
     775         282 :                 const auto eFlattenType = wkbFlatten(eType);
     776         282 :                 std::string osType = "Unknown";
     777         282 :                 if (wkbPoint == eFlattenType)
     778          66 :                     osType = "Point";
     779         216 :                 else if (wkbLineString == eFlattenType)
     780          34 :                     osType = "LineString";
     781         182 :                 else if (wkbPolygon == eFlattenType)
     782          53 :                     osType = "Polygon";
     783         129 :                 else if (wkbMultiPoint == eFlattenType)
     784          26 :                     osType = "MultiPoint";
     785         103 :                 else if (wkbMultiLineString == eFlattenType)
     786          29 :                     osType = "MultiLineString";
     787          74 :                 else if (wkbMultiPolygon == eFlattenType)
     788          69 :                     osType = "MultiPolygon";
     789           5 :                 else if (wkbGeometryCollection == eFlattenType)
     790           5 :                     osType = "GeometryCollection";
     791         282 :                 if (osType != "Unknown")
     792             :                 {
     793             :                     // M and ZM not supported officially currently, but it
     794             :                     // doesn't hurt to anticipate
     795         282 :                     if (OGR_GT_HasZ(eType) && OGR_GT_HasM(eType))
     796           8 :                         osType += " ZM";
     797         274 :                     else if (OGR_GT_HasZ(eType))
     798          91 :                         osType += " Z";
     799         183 :                     else if (OGR_GT_HasM(eType))
     800           8 :                         osType += " M";
     801             :                 }
     802         282 :                 return osType;
     803             :             };
     804             : 
     805         251 :             if (m_bForceCounterClockwiseOrientation)
     806         250 :                 oColumn.Add("orientation", "counterclockwise");
     807             : 
     808         251 :             CPLJSONArray oArray;
     809         533 :             for (const auto eType : m_oSetWrittenGeometryTypes[i])
     810             :             {
     811         282 :                 oArray.Add(GetStringGeometryType(eType));
     812             :             }
     813         251 :             oColumn.Add("geometry_types", oArray);
     814             :         }
     815             : 
     816         234 :         return oRoot.Format(CPLJSONObject::PrettyFormat::Plain);
     817             :     }
     818          12 :     return std::string();
     819             : }
     820             : 
     821             : /************************************************************************/
     822             : /*               PerformStepsBeforeFinalFlushGroup()                    */
     823             : /************************************************************************/
     824             : 
     825         262 : void OGRParquetWriterLayer::PerformStepsBeforeFinalFlushGroup()
     826             : {
     827         262 :     if (m_poKeyValueMetadata)
     828             :     {
     829         524 :         const std::string osGeoMetadata = GetGeoMetadata();
     830         524 :         auto poTmpSchema = m_poSchema;
     831         262 :         if (!osGeoMetadata.empty())
     832             :         {
     833             :             // HACK: it would be good for Arrow to provide a clean way to alter
     834             :             // key value metadata before finalizing.
     835             :             // We need to write metadata at end to write the bounding box.
     836         250 :             const_cast<arrow::KeyValueMetadata *>(m_poKeyValueMetadata.get())
     837         250 :                 ->Append("geo", osGeoMetadata);
     838             : 
     839         250 :             auto kvMetadata = poTmpSchema->metadata()
     840           9 :                                   ? poTmpSchema->metadata()->Copy()
     841         259 :                                   : std::make_shared<arrow::KeyValueMetadata>();
     842         250 :             kvMetadata->Append("geo", osGeoMetadata);
     843         250 :             poTmpSchema = poTmpSchema->WithMetadata(kvMetadata);
     844             :         }
     845             : 
     846         262 :         if (CPLTestBool(
     847             :                 CPLGetConfigOption("OGR_PARQUET_WRITE_ARROW_SCHEMA", "YES")))
     848             :         {
     849             :             auto status =
     850         524 :                 ::arrow::ipc::SerializeSchema(*poTmpSchema, m_poMemoryPool);
     851         262 :             if (status.ok())
     852             :             {
     853             :                 // The serialized schema is not UTF-8, which is required for
     854             :                 // Thrift
     855         524 :                 const std::string schema_as_string = (*status)->ToString();
     856             :                 const std::string schema_base64 =
     857         262 :                     ::arrow::util::base64_encode(schema_as_string);
     858         262 :                 static const std::string kArrowSchemaKey = "ARROW:schema";
     859             :                 const_cast<arrow::KeyValueMetadata *>(
     860         262 :                     m_poKeyValueMetadata.get())
     861         262 :                     ->Append(kArrowSchemaKey, schema_base64);
     862             :             }
     863             :         }
     864             : 
     865             :         // Put GDAL metadata into a gdal:metadata domain
     866         524 :         CPLJSONObject oMultiMetadata;
     867         262 :         bool bHasMultiMetadata = false;
     868         266 :         auto &l_oMDMD = oMDMD.GetDomainList() && *(oMDMD.GetDomainList())
     869         266 :                             ? oMDMD
     870         258 :                             : m_poDataset->GetMultiDomainMetadata();
     871         268 :         for (CSLConstList papszDomainIter = l_oMDMD.GetDomainList();
     872         268 :              papszDomainIter && *papszDomainIter; ++papszDomainIter)
     873             :         {
     874           6 :             const char *pszDomain = *papszDomainIter;
     875           6 :             CSLConstList papszMD = l_oMDMD.GetMetadata(pszDomain);
     876           6 :             if (STARTS_WITH(pszDomain, "json:") && papszMD && papszMD[0])
     877             :             {
     878           1 :                 CPLJSONDocument oDoc;
     879           1 :                 if (oDoc.LoadMemory(papszMD[0]))
     880             :                 {
     881           1 :                     bHasMultiMetadata = true;
     882           1 :                     oMultiMetadata.Add(pszDomain, oDoc.GetRoot());
     883           1 :                     continue;
     884           0 :                 }
     885             :             }
     886           5 :             else if (STARTS_WITH(pszDomain, "xml:") && papszMD && papszMD[0])
     887             :             {
     888           1 :                 bHasMultiMetadata = true;
     889           1 :                 oMultiMetadata.Add(pszDomain, papszMD[0]);
     890           1 :                 continue;
     891             :             }
     892           8 :             CPLJSONObject oMetadata;
     893           4 :             bool bHasMetadata = false;
     894           8 :             for (CSLConstList papszMDIter = papszMD;
     895           8 :                  papszMDIter && *papszMDIter; ++papszMDIter)
     896             :             {
     897           4 :                 char *pszKey = nullptr;
     898           4 :                 const char *pszValue = CPLParseNameValue(*papszMDIter, &pszKey);
     899           4 :                 if (pszKey && pszValue)
     900             :                 {
     901           4 :                     bHasMetadata = true;
     902           4 :                     bHasMultiMetadata = true;
     903           4 :                     oMetadata.Add(pszKey, pszValue);
     904             :                 }
     905           4 :                 CPLFree(pszKey);
     906             :             }
     907           4 :             if (bHasMetadata)
     908           4 :                 oMultiMetadata.Add(pszDomain, oMetadata);
     909             :         }
     910         262 :         if (bHasMultiMetadata)
     911             :         {
     912           4 :             const_cast<arrow::KeyValueMetadata *>(m_poKeyValueMetadata.get())
     913           4 :                 ->Append(
     914             :                     "gdal:metadata",
     915           8 :                     oMultiMetadata.Format(CPLJSONObject::PrettyFormat::Plain));
     916             :         }
     917             :     }
     918         262 : }
     919             : 
     920             : /************************************************************************/
     921             : /*                                 Open()                               */
     922             : /************************************************************************/
     923             : 
     924             : // Same as parquet::arrow::FileWriter::Open(), except we also
     925             : // return KeyValueMetadata
     926             : static arrow::Status
     927         262 : Open(const ::arrow::Schema &schema, ::arrow::MemoryPool *pool,
     928             :      std::shared_ptr<::arrow::io::OutputStream> sink,
     929             :      std::shared_ptr<parquet::WriterProperties> properties,
     930             :      std::shared_ptr<parquet::ArrowWriterProperties> arrow_properties,
     931             :      std::unique_ptr<parquet::arrow::FileWriter> *writer,
     932             :      std::shared_ptr<const arrow::KeyValueMetadata> *outMetadata)
     933             : {
     934         262 :     std::shared_ptr<parquet::SchemaDescriptor> parquet_schema;
     935         524 :     RETURN_NOT_OK(parquet::arrow::ToParquetSchema(
     936             :         &schema, *properties, *arrow_properties, &parquet_schema));
     937             : 
     938             :     auto schema_node = std::static_pointer_cast<parquet::schema::GroupNode>(
     939         524 :         parquet_schema->schema_root());
     940             : 
     941         262 :     auto metadata = schema.metadata()
     942          14 :                         ? schema.metadata()->Copy()
     943         538 :                         : std::make_shared<arrow::KeyValueMetadata>();
     944         262 :     *outMetadata = metadata;
     945             : 
     946         262 :     std::unique_ptr<parquet::ParquetFileWriter> base_writer;
     947         262 :     PARQUET_CATCH_NOT_OK(base_writer = parquet::ParquetFileWriter::Open(
     948             :                              std::move(sink), std::move(schema_node),
     949             :                              std::move(properties), metadata));
     950             : 
     951         262 :     auto schema_ptr = std::make_shared<::arrow::Schema>(schema);
     952             :     return parquet::arrow::FileWriter::Make(
     953         524 :         pool, std::move(base_writer), std::move(schema_ptr),
     954         786 :         std::move(arrow_properties), writer);
     955             : }
     956             : 
     957             : /************************************************************************/
     958             : /*                          CreateSchema()                              */
     959             : /************************************************************************/
     960             : 
     961         262 : void OGRParquetWriterLayer::CreateSchema()
     962             : {
     963         262 :     CreateSchemaCommon();
     964         262 : }
     965             : 
     966             : /************************************************************************/
     967             : /*                          CreateGeomField()                           */
     968             : /************************************************************************/
     969             : 
     970          27 : OGRErr OGRParquetWriterLayer::CreateGeomField(const OGRGeomFieldDefn *poField,
     971             :                                               int bApproxOK)
     972             : {
     973          27 :     OGRErr eErr = OGRArrowWriterLayer::CreateGeomField(poField, bApproxOK);
     974          53 :     if (eErr == OGRERR_NONE &&
     975          26 :         m_aeGeomEncoding.back() == OGRArrowGeomEncoding::WKB)
     976             :     {
     977           2 :         m_oWriterPropertiesBuilder.disable_statistics(
     978           6 :             parquet::schema::ColumnPath::FromDotString(
     979           2 :                 m_poFeatureDefn
     980           2 :                     ->GetGeomFieldDefn(m_poFeatureDefn->GetGeomFieldCount() - 1)
     981             :                     ->GetNameRef()));
     982             :     }
     983          27 :     return eErr;
     984             : }
     985             : 
     986             : /************************************************************************/
     987             : /*                          CreateWriter()                              */
     988             : /************************************************************************/
     989             : 
     990         262 : void OGRParquetWriterLayer::CreateWriter()
     991             : {
     992         262 :     CPLAssert(m_poFileWriter == nullptr);
     993             : 
     994         262 :     if (m_poSchema == nullptr)
     995             :     {
     996          40 :         CreateSchema();
     997             :     }
     998             :     else
     999             :     {
    1000         222 :         FinalizeSchema();
    1001             :     }
    1002             : 
    1003             :     auto arrowWriterProperties =
    1004         262 :         parquet::ArrowWriterProperties::Builder().store_schema()->build();
    1005         786 :     CPL_IGNORE_RET_VAL(Open(*m_poSchema, m_poMemoryPool, m_poOutputStream,
    1006         524 :                             m_oWriterPropertiesBuilder.build(),
    1007         262 :                             std::move(arrowWriterProperties), &m_poFileWriter,
    1008             :                             &m_poKeyValueMetadata));
    1009         262 : }
    1010             : 
    1011             : /************************************************************************/
    1012             : /*                          ICreateFeature()                            */
    1013             : /************************************************************************/
    1014             : 
    1015        3066 : OGRErr OGRParquetWriterLayer::ICreateFeature(OGRFeature *poFeature)
    1016             : {
                           // Writes one feature to this layer.
                           //
                           // When m_poTmpGPKGLayer is null the call is forwarded unchanged to
                           // OGRArrowWriterLayer::ICreateFeature(). Otherwise the feature is
                           // serialized to a byte buffer and stored in the temporary layer, with
                           // a polygon built from the geometry's envelope as its geometry.
                           //
                           // poFeature: feature to write. On the temporary-layer path its FID is
                           //            overwritten with m_nTmpFeatureCount when m_osFIDColumn is
                           //            non-empty and the FID is OGRNullFID.
                           // Returns OGRERR_FAILURE when serialization fails or the serialized
                           // size exceeds the limit checked below; otherwise the value returned
                           // by the delegated ICreateFeature()/CreateFeature() call.
    1017             :     // If not using SORT_BY_BBOX=YES layer creation option, we can directly
    1018             :     // write features to the final Parquet file
    1019        3066 :     if (!m_poTmpGPKGLayer)
    1020         862 :         return OGRArrowWriterLayer::ICreateFeature(poFeature);
    1021             : 
    1022             :     // SORT_BY_BBOX=YES case: we write for now a serialized version of poFeature
    1023             :     // in a temporary GeoPackage file.
    1024             : 
    1025        2204 :     GIntBig nFID = poFeature->GetFID();
    1026        2204 :     if (!m_osFIDColumn.empty() && nFID == OGRNullFID)
    1027             :     {
                               // No FID supplied: use the count of features handled so far on
                               // this path, and record it on the source feature as well.
    1028        1102 :         nFID = m_nTmpFeatureCount;
    1029        1102 :         poFeature->SetFID(nFID);
    1030             :     }
                           // Incremented for every feature taking this path, whether or not a
                           // FID was assigned above.
    1031        2204 :     ++m_nTmpFeatureCount;
    1032             : 
    1033        4408 :     std::vector<GByte> abyBuffer;
    1034             :     // Serialize the source feature as a single array of bytes to preserve it
    1035             :     // fully
    1036        2204 :     if (!poFeature->SerializeToBinary(abyBuffer))
    1037             :     {
    1038           0 :         return OGRERR_FAILURE;
    1039             :     }
    1040             : 
    1041             :     // SQLite3 limitation: a row must fit in slightly less than 1 GB.
                           // The accepted maximum is 1024^3 - SOME_MARGIN bytes.
    1042        2204 :     constexpr int SOME_MARGIN = 128;
    1043        2204 :     if (abyBuffer.size() > 1024 * 1024 * 1024 - SOME_MARGIN)
    1044             :     {
    1045           0 :         CPLError(CE_Failure, CPLE_NotSupported,
    1046             :                  "Features larger than 1 GB are not supported");
    1047           0 :         return OGRERR_FAILURE;
    1048             :     }
    1049             : 
                           // Build the feature stored in the temporary layer: same FID, and the
                           // serialized bytes in field 0. The size check above bounds the value
                           // passed to static_cast<int>.
    1050        4408 :     OGRFeature oFeat(m_poTmpGPKGLayer->GetLayerDefn());
    1051        2204 :     oFeat.SetFID(nFID);
    1052        2204 :     oFeat.SetField(0, static_cast<int>(abyBuffer.size()), abyBuffer.data());
    1053        2204 :     const auto poSrcGeom = poFeature->GetGeometryRef();
                           // A null or empty source geometry leaves oFeat without a geometry.
    1054        2204 :     if (poSrcGeom && !poSrcGeom->IsEmpty())
    1055             :     {
    1056             :         // For the purpose of building an RTree, just use the bounding box of
    1057             :         // the geometry as the geometry.
    1058        1202 :         OGREnvelope sEnvelope;
    1059        1202 :         poSrcGeom->getEnvelope(&sEnvelope);
    1060        2404 :         auto poPoly = std::make_unique<OGRPolygon>();
    1061        2404 :         auto poLR = std::make_unique<OGRLinearRing>();
                               // Corner order: (MinX,MinY), (MinX,MaxY), (MaxX,MaxY), (MaxX,MinY),
                               // then the first corner again to close the ring.
    1062        1202 :         poLR->addPoint(sEnvelope.MinX, sEnvelope.MinY);
    1063        1202 :         poLR->addPoint(sEnvelope.MinX, sEnvelope.MaxY);
    1064        1202 :         poLR->addPoint(sEnvelope.MaxX, sEnvelope.MaxY);
    1065        1202 :         poLR->addPoint(sEnvelope.MaxX, sEnvelope.MinY);
    1066        1202 :         poLR->addPoint(sEnvelope.MinX, sEnvelope.MinY);
                               // release() gives up unique_ptr ownership; the *Directly() callees
                               // presumably take it over -- confirm against their documentation.
    1067        1202 :         poPoly->addRingDirectly(poLR.release());
    1068        1202 :         oFeat.SetGeometryDirectly(poPoly.release());
    1069             :     }
    1070        2204 :     return m_poTmpGPKGLayer->CreateFeature(&oFeat);
    1071             : }
    1072             : 
    1073             : /************************************************************************/
    1074             : /*                            FlushGroup()                              */
    1075             : /************************************************************************/
    1076             : 
    1077         246 : bool OGRParquetWriterLayer::FlushGroup()
    1078             : {
                           // Opens a new row group on m_poFileWriter, writes every array to it
                           // through WriteArrays(), then clears the array builders.
                           //
                           // Returns false if NewRowGroup() fails; otherwise returns whatever
                           // WriteArrays() returned. The builders are cleared on both paths.
                           //
                           // NOTE(review): m_apoBuilders[0] is indexed without a visible emptiness
                           // check -- presumably callers guarantee at least one builder; confirm.
    1079         492 :     auto status = m_poFileWriter->NewRowGroup(m_apoBuilders[0]->length());
    1080         246 :     if (!status.ok())
    1081             :     {
    1082           0 :         CPLError(CE_Failure, CPLE_AppDefined, "NewRowGroup() failed with %s",
    1083           0 :                  status.message().c_str());
    1084           0 :         ClearArrayBuilers();
    1085           0 :         return false;
    1086             :     }
    1087             : 
                           // Per-array callback: writes one column chunk and reports failure by
                           // returning false after emitting an error naming the field.
    1088         246 :     auto ret = WriteArrays(
    1089         995 :         [this](const std::shared_ptr<arrow::Field> &field,
    1090         995 :                const std::shared_ptr<arrow::Array> &array)
    1091             :         {
    1092        1990 :             auto l_status = m_poFileWriter->WriteColumnChunk(*array);
    1093         995 :             if (!l_status.ok())
    1094             :             {
    1095           0 :                 CPLError(CE_Failure, CPLE_AppDefined,
    1096             :                          "WriteColumnChunk() failed for field %s: %s",
    1097           0 :                          field->name().c_str(), l_status.message().c_str());
    1098           0 :                 return false;
    1099             :             }
    1100         995 :             return true;
    1101             :         });
    1102             : 
    1103         246 :     ClearArrayBuilers();
    1104         246 :     return ret;
    1105             : }
    1106             : 
    1107             : /************************************************************************/
    1108             : /*                    FixupWKBGeometryBeforeWriting()                   */
    1109             : /************************************************************************/
    1110             : 
    1111          43 : void OGRParquetWriterLayer::FixupWKBGeometryBeforeWriting(GByte *pabyWkb,
    1112             :                                                           size_t nLen)
    1113             : {
                           // Passes the WKB buffer (pabyWkb, nLen bytes) to
                           // OGRWKBFixupCounterClockWiseExternalRing() when
                           // m_bForceCounterClockwiseOrientation is set; does nothing otherwise.
                           // The buffer pointer is non-const, so the helper may modify it in place.
    1114          43 :     if (!m_bForceCounterClockwiseOrientation)
    1115           0 :         return;
    1116             : 
    1117          43 :     OGRWKBFixupCounterClockWiseExternalRing(pabyWkb, nLen);
    1118             : }
    1119             : 
    1120             : /************************************************************************/
    1121             : /*                     FixupGeometryBeforeWriting()                     */
    1122             : /************************************************************************/
    1123             : 
    1124        1334 : void OGRParquetWriterLayer::FixupGeometryBeforeWriting(OGRGeometry *poGeom)
    1125             : {
                           // Reorients polygon rings of poGeom in place when
                           // m_bForceCounterClockwiseOrientation is set; does nothing otherwise.
                           //
                           // Polygons have their rings reversed as needed. Multi-polygons and
                           // geometry collections are handled by recursing into each member.
                           // Any other geometry type is left untouched.
    1126        1334 :     if (!m_bForceCounterClockwiseOrientation)
    1127           3 :         return;
    1128             : 
                           // wkbFlatten() result is compared against the plain wkbPolygon /
                           // wkbMultiPolygon / wkbGeometryCollection codes below.
    1129        1331 :     const auto eFlattenType = wkbFlatten(poGeom->getGeometryType());
    1130             :     // Polygon rings MUST follow the right-hand rule for orientation
    1131             :     // (counterclockwise external rings, clockwise internal rings)
    1132        1331 :     if (eFlattenType == wkbPolygon)
    1133             :     {
    1134          44 :         bool bFirstRing = true;
    1135          91 :         for (auto poRing : poGeom->toPolygon())
    1136             :         {
                                   // The first ring iterated is treated as the external ring:
                                   // reverse it if clockwise. Every later ring is reversed if it
                                   // is not clockwise.
    1137          55 :             if ((bFirstRing && poRing->isClockwise()) ||
    1138           8 :                 (!bFirstRing && !poRing->isClockwise()))
    1139             :             {
    1140          42 :                 poRing->reversePoints();
    1141             :             }
    1142          47 :             bFirstRing = false;
    1143             :         }
    1144             :     }
    1145        1287 :     else if (eFlattenType == wkbMultiPolygon ||
    1146             :              eFlattenType == wkbGeometryCollection)
    1147             :     {
                               // Recurse so that polygons nested at any depth are processed.
    1148          35 :         for (auto poSubGeom : poGeom->toGeometryCollection())
    1149             :         {
    1150          21 :             FixupGeometryBeforeWriting(poSubGeom);
    1151             :         }
    1152             :     }
    1153             : }
    1154             : 
    1155             : /************************************************************************/
    1156             : /*                          WriteArrowBatch()                           */
    1157             : /************************************************************************/
    1158             : 
    1159             : #if PARQUET_VERSION_MAJOR > 10
    1160             : inline bool
    1161          14 : OGRParquetWriterLayer::WriteArrowBatch(const struct ArrowSchema *schema,
    1162             :                                        struct ArrowArray *array,
    1163             :                                        CSLConstList papszOptions)
    1164             : {
                           // Writes an Arrow batch to the layer. Only compiled when
                           // PARQUET_VERSION_MAJOR > 10.
                           //
                           // With a temporary GeoPackage layer the call is delegated to
                           // OGRLayer::WriteArrowBatch(). Otherwise WriteArrowBatchInternal() is
                           // used with a callback that writes the record batch into a new
                           // buffered row group.
                           // Returns the result of whichever of those two calls is made.
    1165          14 :     if (m_poTmpGPKGLayer)
    1166             :     {
    1167             :         // When using SORT_BY_BBOX=YES option, we can't directly write the
    1168             :         // input array, because we need to sort features. Hence we fall back
    1169             :         // to the OGRLayer base implementation, which will ultimately call
    1170             :         // OGRParquetWriterLayer::ICreateFeature()
    1171           0 :         return OGRLayer::WriteArrowBatch(schema, array, papszOptions);
    1172             :     }
    1173             : 
    1174          28 :     return WriteArrowBatchInternal(
    1175             :         schema, array, papszOptions,
    1176          28 :         [this](const std::shared_ptr<arrow::RecordBatch> &poBatch)
    1177             :         {
                                   // Callback: open a buffered row group, then write the batch
                                   // into it. Either failure emits an error and returns false.
    1178          28 :             auto status = m_poFileWriter->NewBufferedRowGroup();
    1179          14 :             if (!status.ok())
    1180             :             {
    1181           0 :                 CPLError(CE_Failure, CPLE_AppDefined,
    1182             :                          "NewBufferedRowGroup() failed with %s",
    1183           0 :                          status.message().c_str());
    1184           0 :                 return false;
    1185             :             }
    1186             : 
    1187          14 :             status = m_poFileWriter->WriteRecordBatch(*poBatch);
    1188          14 :             if (!status.ok())
    1189             :             {
    1190           0 :                 CPLError(CE_Failure, CPLE_AppDefined,
    1191             :                          "WriteRecordBatch() failed: %s",
    1192           0 :                          status.message().c_str());
    1193           0 :                 return false;
    1194             :             }
    1195             : 
    1196          14 :             return true;
    1197          14 :         });
    1198             : }
    1199             : #endif
    1200             : 
    1201             : /************************************************************************/
    1202             : /*                         TestCapability()                             */
    1203             : /************************************************************************/
    1204             : 
    1205         475 : inline int OGRParquetWriterLayer::TestCapability(const char *pszCap)
    1206             : {
                           // Reports whether the capability named pszCap is supported.
                           //
                           // OLCFastWriteArrowBatch is reported as unsupported (false) when built
                           // with PARQUET_VERSION_MAJOR <= 10, or when a temporary GeoPackage
                           // layer is in use. Every other query is answered by
                           // OGRArrowWriterLayer::TestCapability().
    1207             : #if PARQUET_VERSION_MAJOR <= 10
    1208             :     if (EQUAL(pszCap, OLCFastWriteArrowBatch))
    1209             :         return false;
    1210             : #endif
    1211             : 
    1212         475 :     if (m_poTmpGPKGLayer && EQUAL(pszCap, OLCFastWriteArrowBatch))
    1213             :     {
    1214             :         // When using SORT_BY_BBOX=YES option, we can't directly write the
    1215             :         // input array, because we need to sort features. So this is not
    1216             :         // fast
    1217           1 :         return false;
    1218             :     }
    1219             : 
    1220         474 :     return OGRArrowWriterLayer::TestCapability(pszCap);
    1221             : }
    1222             : 
    1223             : /************************************************************************/
    1224             : /*                        CreateFieldFromArrowSchema()                  */
    1225             : /************************************************************************/
    1226             : 
    1227             : #if PARQUET_VERSION_MAJOR > 10
    1228         396 : bool OGRParquetWriterLayer::CreateFieldFromArrowSchema(
    1229             :     const struct ArrowSchema *schema, CSLConstList papszOptions)
    1230             : {
                           // Creates a field from an Arrow schema. Only compiled when
                           // PARQUET_VERSION_MAJOR > 10.
                           //
                           // Delegates to OGRLayer::CreateFieldFromArrowSchema() when a temporary
                           // GeoPackage layer is in use, and to
                           // OGRArrowWriterLayer::CreateFieldFromArrowSchema() otherwise.
                           // Both arguments are passed through unchanged and the delegate's
                           // result is returned.
    1231         396 :     if (m_poTmpGPKGLayer)
    1232             :     {
    1233             :         // When using SORT_BY_BBOX=YES option, we can't directly write the
    1234             :         // input array, because we need to sort features. But this process
    1235             :         // only supports the base Arrow types supported by
    1236             :         // OGRLayer::WriteArrowBatch()
    1237           0 :         return OGRLayer::CreateFieldFromArrowSchema(schema, papszOptions);
    1238             :     }
    1239             : 
    1240         396 :     return OGRArrowWriterLayer::CreateFieldFromArrowSchema(schema,
    1241         396 :                                                            papszOptions);
    1242             : }
    1243             : #endif
    1244             : 
    1245             : /************************************************************************/
    1246             : /*                        IsArrowSchemaSupported()                      */
    1247             : /************************************************************************/
    1248             : 
    1249             : #if PARQUET_VERSION_MAJOR > 10
    1250        1077 : bool OGRParquetWriterLayer::IsArrowSchemaSupported(
    1251             :     const struct ArrowSchema *schema, CSLConstList papszOptions,
    1252             :     std::string &osErrorMsg) const
    1253             : {
                           // Tells whether `schema`, including all of its children, can be
                           // written by this layer. Only compiled when PARQUET_VERSION_MAJOR > 10.
                           //
                           // schema:       Arrow schema whose `format` string and children are
                           //               inspected.
                           // papszOptions: forwarded unchanged to the delegate / recursive calls.
                           // osErrorMsg:   set to the reason when false is returned by one of the
                           //               checks in this function.
                           // Returns false for the format strings rejected below or when any
                           // child is rejected; true otherwise.
    1254        1077 :     if (m_poTmpGPKGLayer)
    1255             :     {
    1256             :         // When using SORT_BY_BBOX=YES option, we can't directly write the
    1257             :         // input array, because we need to sort features. But this process
    1258             :         // only supports the base Arrow types supported by
    1259             :         // OGRLayer::WriteArrowBatch()
    1260           0 :         return OGRLayer::IsArrowSchemaSupported(schema, papszOptions,
    1261           0 :                                                 osErrorMsg);
    1262             :     }
    1263             : 
                           // Format string exactly "e" (second character is the terminator).
    1264        1077 :     if (schema->format[0] == 'e' && schema->format[1] == 0)
    1265             :     {
    1266           1 :         osErrorMsg = "float16 not supported";
    1267           1 :         return false;
    1268             :     }
                           // Format strings starting with "vu" / "vz": only the first two
                           // characters are compared.
    1269        1076 :     if (schema->format[0] == 'v' && schema->format[1] == 'u')
    1270             :     {
    1271           1 :         osErrorMsg = "StringView not supported";
    1272           1 :         return false;
    1273             :     }
    1274        1075 :     if (schema->format[0] == 'v' && schema->format[1] == 'z')
    1275             :     {
    1276           1 :         osErrorMsg = "BinaryView not supported";
    1277           1 :         return false;
    1278             :     }
                           // Format strings starting with "+v": the third character selects
                           // ListView ('l') or LargeListView ('L'). Any other third character
                           // falls through to the children check.
    1279        1074 :     if (schema->format[0] == '+' && schema->format[1] == 'v')
    1280             :     {
    1281           0 :         if (schema->format[2] == 'l')
    1282             :         {
    1283           0 :             osErrorMsg = "ListView not supported";
    1284           0 :             return false;
    1285             :         }
    1286           0 :         else if (schema->format[2] == 'L')
    1287             :         {
    1288           0 :             osErrorMsg = "LargeListView not supported";
    1289           0 :             return false;
    1290             :         }
    1291             :     }
                           // Recurse into every child; the first unsupported child stops the
                           // walk, with osErrorMsg as set by that recursive call.
    1292        2136 :     for (int64_t i = 0; i < schema->n_children; ++i)
    1293             :     {
    1294        1065 :         if (!IsArrowSchemaSupported(schema->children[i], papszOptions,
    1295             :                                     osErrorMsg))
    1296             :         {
    1297           3 :             return false;
    1298             :         }
    1299             :     }
    1300        1071 :     return true;
    1301             : }
    1302             : #endif
    1303             : 
    1304             : /************************************************************************/
    1305             : /*                            SetMetadata()                             */
    1306             : /************************************************************************/
    1307             : 
    1308           7 : CPLErr OGRParquetWriterLayer::SetMetadata(char **papszMetadata,
    1309             :                                           const char *pszDomain)
    1310             : {
                           // Sets layer metadata, except for the "SHAPEFILE" domain.
                           //
                           // A null pszDomain, or one that EQUAL() does not match against
                           // "SHAPEFILE", is forwarded to OGRLayer::SetMetadata(). For the
                           // "SHAPEFILE" domain nothing is stored and CE_None is returned.
                           // NOTE(review): the reason for skipping that domain is not visible
                           // in this function -- confirm with the caller/driver documentation.
    1311           7 :     if (!pszDomain || !EQUAL(pszDomain, "SHAPEFILE"))
    1312             :     {
    1313           5 :         return OGRLayer::SetMetadata(papszMetadata, pszDomain);
    1314             :     }
    1315           2 :     return CE_None;
    1316             : }
    1317             : 
    1318             : /************************************************************************/
    1319             : /*                             GetDataset()                             */
    1320             : /************************************************************************/
    1321             : 
    1322          23 : GDALDataset *OGRParquetWriterLayer::GetDataset()
    1323             : {
                           // Returns the m_poDataset member; no other work is done here.
    1324          23 :     return m_poDataset;
    1325             : }

Generated by: LCOV version 1.14