LCOV - code coverage report
Current view: top level - ogr/ogrsf_frmts/parquet - ogrparquetwriterlayer.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 513 590 86.9 %
Date: 2024-05-14 23:54:21 Functions: 28 28 100.0 %

          Line data    Source code
       1             : /******************************************************************************
       2             :  *
       3             :  * Project:  Parquet Translator
       4             :  * Purpose:  Implements OGRParquetDriver.
       5             :  * Author:   Even Rouault, <even.rouault at spatialys.com>
       6             :  *
       7             :  ******************************************************************************
       8             :  * Copyright (c) 2022, Planet Labs
       9             :  *
      10             :  * Permission is hereby granted, free of charge, to any person obtaining a
      11             :  * copy of this software and associated documentation files (the "Software"),
      12             :  * to deal in the Software without restriction, including without limitation
      13             :  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      14             :  * and/or sell copies of the Software, and to permit persons to whom the
      15             :  * Software is furnished to do so, subject to the following conditions:
      16             :  *
      17             :  * The above copyright notice and this permission notice shall be included
      18             :  * in all copies or substantial portions of the Software.
      19             :  *
      20             :  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
      21             :  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
      22             :  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
      23             :  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
      24             :  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
      25             :  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
      26             :  * DEALINGS IN THE SOFTWARE.
      27             :  ****************************************************************************/
      28             : 
      29             : #undef DO_NOT_DEFINE_GDAL_DATE_NAME
      30             : #include "gdal_version_full/gdal_version.h"
      31             : 
      32             : #include "ogr_parquet.h"
      33             : 
      34             : #include "../arrow_common/ograrrowwriterlayer.hpp"
      35             : 
      36             : #include "ogr_wkb.h"
      37             : 
      38             : #include <utility>
      39             : 
      40             : /************************************************************************/
      41             : /*                      OGRParquetWriterLayer()                         */
      42             : /************************************************************************/
      43             : 
      44         200 : OGRParquetWriterLayer::OGRParquetWriterLayer(
      45             :     OGRParquetWriterDataset *poDataset, arrow::MemoryPool *poMemoryPool,
      46             :     const std::shared_ptr<arrow::io::OutputStream> &poOutputStream,
      47         200 :     const char *pszLayerName)
      48             :     : OGRArrowWriterLayer(poMemoryPool, poOutputStream, pszLayerName),
      49         200 :       m_poDataset(poDataset)
      50             : {
      51         200 :     m_bWriteFieldArrowExtensionName = CPLTestBool(
      52             :         CPLGetConfigOption("OGR_PARQUET_WRITE_ARROW_EXTENSION_NAME", "NO"));
      53         200 : }
      54             : 
      55             : /************************************************************************/
      56             : /*                                Close()                               */
      57             : /************************************************************************/
      58             : 
      59         197 : bool OGRParquetWriterLayer::Close()
      60             : {
      61         197 :     if (m_poTmpGPKGLayer)
      62             :     {
      63           2 :         if (!CopyTmpGpkgLayerToFinalFile())
      64           0 :             return false;
      65             :     }
      66             : 
      67         197 :     if (m_bInitializationOK)
      68             :     {
      69         197 :         if (!FinalizeWriting())
      70           0 :             return false;
      71             :     }
      72             : 
      73         197 :     return true;
      74             : }
      75             : 
      76             : /************************************************************************/
      77             : /*                     CopyTmpGpkgLayerToFinalFile()                    */
      78             : /************************************************************************/
      79             : 
      80           2 : bool OGRParquetWriterLayer::CopyTmpGpkgLayerToFinalFile()
      81             : {
      82           2 :     if (!m_poTmpGPKGLayer)
      83             :     {
      84           0 :         return true;
      85             :     }
      86             : 
      87           2 :     CPLDebug("PARQUET", "CopyTmpGpkgLayerToFinalFile(): start...");
      88             : 
      89           2 :     VSIUnlink(m_poTmpGPKG->GetDescription());
      90             : 
      91           4 :     OGRFeature oFeat(m_poFeatureDefn);
      92             : 
      93             :     // Interval in terms of features between 2 debug progress report messages
      94           2 :     constexpr int PROGRESS_FC_INTERVAL = 100 * 1000;
      95             : 
      96             :     // First, write features without geometries
      97             :     {
      98           2 :         auto poTmpLayer = std::unique_ptr<OGRLayer>(m_poTmpGPKG->ExecuteSQL(
      99             :             "SELECT serialized_feature FROM tmp WHERE fid NOT IN (SELECT id "
     100             :             "FROM rtree_tmp_geom)",
     101           2 :             nullptr, nullptr));
     102           2 :         if (!poTmpLayer)
     103           0 :             return false;
     104        1004 :         for (const auto &poSrcFeature : poTmpLayer.get())
     105             :         {
     106        1002 :             int nBytesFeature = 0;
     107             :             const GByte *pabyFeatureData =
     108        1002 :                 poSrcFeature->GetFieldAsBinary(0, &nBytesFeature);
     109        1002 :             if (!oFeat.DeserializeFromBinary(pabyFeatureData, nBytesFeature))
     110             :             {
     111           0 :                 CPLError(CE_Failure, CPLE_AppDefined,
     112             :                          "Cannot deserialize feature");
     113           0 :                 return false;
     114             :             }
     115        1002 :             if (OGRArrowWriterLayer::ICreateFeature(&oFeat) != OGRERR_NONE)
     116             :             {
     117           0 :                 return false;
     118             :             }
     119             : 
     120        1002 :             if ((m_nFeatureCount % PROGRESS_FC_INTERVAL) == 0)
     121             :             {
     122           0 :                 CPLDebugProgress(
     123             :                     "PARQUET",
     124             :                     "CopyTmpGpkgLayerToFinalFile(): %.02f%% progress",
     125           0 :                     100.0 * double(m_nFeatureCount) /
     126           0 :                         double(m_nTmpFeatureCount));
     127             :             }
     128             :         }
     129             : 
     130           2 :         if (!FlushFeatures())
     131             :         {
     132           0 :             return false;
     133             :         }
     134             :     }
     135             : 
     136             :     // Now walk through the GPKG RTree for features with geometries
     137             :     // Cf https://github.com/sqlite/sqlite/blob/master/ext/rtree/rtree.c
     138             :     // for the description of the content of the rtree _node table
     139           4 :     std::vector<std::pair<int64_t, int>> aNodeNoDepthPair;
     140           2 :     int nTreeDepth = 0;
     141             :     // Queue the root node
     142             :     aNodeNoDepthPair.emplace_back(
     143           2 :         std::make_pair(/* nodeNo = */ 1, /* depth = */ 0));
     144           2 :     int nCountWrittenFeaturesSinceLastFlush = 0;
     145          50 :     while (!aNodeNoDepthPair.empty())
     146             :     {
     147          48 :         const auto &oLastPair = aNodeNoDepthPair.back();
     148          48 :         const int64_t nNodeNo = oLastPair.first;
     149          48 :         const int nCurDepth = oLastPair.second;
     150             :         //CPLDebug("PARQUET", "Reading nodeNode=%d, curDepth=%d", int(nNodeNo), nCurDepth);
     151          48 :         aNodeNoDepthPair.pop_back();
     152             : 
     153          48 :         auto poRTreeLayer = std::unique_ptr<OGRLayer>(m_poTmpGPKG->ExecuteSQL(
     154             :             CPLSPrintf("SELECT data FROM rtree_tmp_geom_node WHERE nodeno "
     155             :                        "= " CPL_FRMT_GIB,
     156             :                        static_cast<GIntBig>(nNodeNo)),
     157          48 :             nullptr, nullptr));
     158          48 :         if (!poRTreeLayer)
     159             :         {
     160           0 :             CPLError(CE_Failure, CPLE_AppDefined,
     161             :                      "Cannot read node " CPL_FRMT_GIB,
     162             :                      static_cast<GIntBig>(nNodeNo));
     163           0 :             return false;
     164             :         }
     165             :         const auto poRTreeFeature =
     166          48 :             std::unique_ptr<const OGRFeature>(poRTreeLayer->GetNextFeature());
     167          48 :         if (!poRTreeFeature)
     168             :         {
     169           0 :             CPLError(CE_Failure, CPLE_AppDefined,
     170             :                      "Cannot read node " CPL_FRMT_GIB,
     171             :                      static_cast<GIntBig>(nNodeNo));
     172           0 :             return false;
     173             :         }
     174             : 
     175          48 :         int nNodeBytes = 0;
     176             :         const GByte *pabyNodeData =
     177          48 :             poRTreeFeature->GetFieldAsBinary(0, &nNodeBytes);
     178          48 :         constexpr int BLOB_HEADER_SIZE = 4;
     179          48 :         if (nNodeBytes < BLOB_HEADER_SIZE)
     180             :         {
     181           0 :             CPLError(CE_Failure, CPLE_AppDefined,
     182             :                      "Not enough bytes when reading node " CPL_FRMT_GIB,
     183             :                      static_cast<GIntBig>(nNodeNo));
     184           0 :             return false;
     185             :         }
     186          48 :         if (nNodeNo == 1)
     187             :         {
     188             :             // Get the RTree depth from the root node
     189           2 :             nTreeDepth = (pabyNodeData[0] << 8) | pabyNodeData[1];
     190             :             //CPLDebug("PARQUET", "nTreeDepth = %d", nTreeDepth);
     191             :         }
     192             : 
     193          48 :         const int nCellCount = (pabyNodeData[2] << 8) | pabyNodeData[3];
     194          48 :         constexpr int SIZEOF_CELL = 24;  // int64_t + 4 float
     195          48 :         if (nNodeBytes < BLOB_HEADER_SIZE + SIZEOF_CELL * nCellCount)
     196             :         {
     197           0 :             CPLError(CE_Failure, CPLE_AppDefined,
     198             :                      "Not enough bytes when reading node " CPL_FRMT_GIB,
     199             :                      static_cast<GIntBig>(nNodeNo));
     200           0 :             return false;
     201             :         }
     202             : 
     203          48 :         size_t nOffset = BLOB_HEADER_SIZE;
     204          48 :         if (nCurDepth == nTreeDepth)
     205             :         {
     206             :             // Leaf node: it references feature IDs.
     207             : 
     208             :             // If we are about to go above m_nRowGroupSize, flush past
     209             :             // features now, to improve the spatial compacity of the row group.
     210          46 :             if (m_nRowGroupSize > nCellCount &&
     211          46 :                 nCountWrittenFeaturesSinceLastFlush + nCellCount >
     212          46 :                     m_nRowGroupSize)
     213             :             {
     214          14 :                 nCountWrittenFeaturesSinceLastFlush = 0;
     215          14 :                 if (!FlushFeatures())
     216             :                 {
     217           0 :                     return false;
     218             :                 }
     219             :             }
     220             : 
     221             :             // nCellCount shouldn't be over 51 normally, but even 65535
     222             :             // would be fine...
     223             :             // coverity[tainted_data]
     224        1248 :             for (int i = 0; i < nCellCount; ++i)
     225             :             {
     226             :                 int64_t nFID;
     227        1202 :                 memcpy(&nFID, pabyNodeData + nOffset, sizeof(int64_t));
     228        1202 :                 CPL_MSBPTR64(&nFID);
     229             : 
     230             :                 const auto poSrcFeature = std::unique_ptr<const OGRFeature>(
     231        1202 :                     m_poTmpGPKGLayer->GetFeature(nFID));
     232        1202 :                 if (!poSrcFeature)
     233             :                 {
     234           0 :                     CPLError(CE_Failure, CPLE_AppDefined,
     235             :                              "Cannot get feature " CPL_FRMT_GIB,
     236             :                              static_cast<GIntBig>(nFID));
     237           0 :                     return false;
     238             :                 }
     239             : 
     240        1202 :                 int nBytesFeature = 0;
     241             :                 const GByte *pabyFeatureData =
     242        1202 :                     poSrcFeature->GetFieldAsBinary(0, &nBytesFeature);
     243        1202 :                 if (!oFeat.DeserializeFromBinary(pabyFeatureData,
     244             :                                                  nBytesFeature))
     245             :                 {
     246           0 :                     CPLError(CE_Failure, CPLE_AppDefined,
     247             :                              "Cannot deserialize feature");
     248           0 :                     return false;
     249             :                 }
     250        1202 :                 if (OGRArrowWriterLayer::ICreateFeature(&oFeat) != OGRERR_NONE)
     251             :                 {
     252           0 :                     return false;
     253             :                 }
     254             : 
     255        1202 :                 nOffset += SIZEOF_CELL;
     256             : 
     257        1202 :                 ++nCountWrittenFeaturesSinceLastFlush;
     258             : 
     259        1202 :                 if ((m_nFeatureCount % PROGRESS_FC_INTERVAL) == 0 ||
     260        1202 :                     m_nFeatureCount == m_nTmpFeatureCount / 2)
     261             :                 {
     262           2 :                     CPLDebugProgress(
     263             :                         "PARQUET",
     264             :                         "CopyTmpGpkgLayerToFinalFile(): %.02f%% progress",
     265           2 :                         100.0 * double(m_nFeatureCount) /
     266           2 :                             double(m_nTmpFeatureCount));
     267             :                 }
     268             :             }
     269             :         }
     270             :         else
     271             :         {
     272             :             // Non-leaf node: it references child nodes.
     273             : 
     274             :             // nCellCount shouldn't be over 51 normally, but even 65535
     275             :             // would be fine...
     276             :             // coverity[tainted_data]
     277          48 :             for (int i = 0; i < nCellCount; ++i)
     278             :             {
     279             :                 int64_t nNode;
     280          46 :                 memcpy(&nNode, pabyNodeData + nOffset, sizeof(int64_t));
     281          46 :                 CPL_MSBPTR64(&nNode);
     282             :                 aNodeNoDepthPair.emplace_back(
     283          46 :                     std::make_pair(nNode, nCurDepth + 1));
     284          46 :                 nOffset += SIZEOF_CELL;
     285             :             }
     286             :         }
     287             :     }
     288             : 
     289           2 :     CPLDebug("PARQUET",
     290             :              "CopyTmpGpkgLayerToFinalFile(): 100%%, successfully finished");
     291           2 :     return true;
     292             : }
     293             : 
     294             : /************************************************************************/
     295             : /*                       IsSupportedGeometryType()                      */
     296             : /************************************************************************/
     297             : 
     298         204 : bool OGRParquetWriterLayer::IsSupportedGeometryType(
     299             :     OGRwkbGeometryType eGType) const
     300             : {
     301         204 :     const auto eFlattenType = wkbFlatten(eGType);
     302         204 :     if (!OGR_GT_HasM(eGType) && eFlattenType <= wkbGeometryCollection)
     303             :     {
     304         203 :         return true;
     305             :     }
     306             : 
     307             :     const auto osConfigOptionName =
     308           3 :         "OGR_" + GetDriverUCName() + "_ALLOW_ALL_DIMS";
     309           1 :     if (CPLTestBool(CPLGetConfigOption(osConfigOptionName.c_str(), "NO")))
     310             :     {
     311           0 :         return true;
     312             :     }
     313             : 
     314           1 :     CPLError(CE_Failure, CPLE_NotSupported,
     315             :              "Only 2D and Z geometry types are supported (unless the "
     316             :              "%s configuration option is set to YES)",
     317             :              osConfigOptionName.c_str());
     318           1 :     return false;
     319             : }
     320             : 
     321             : /************************************************************************/
     322             : /*                           SetOptions()                               */
     323             : /************************************************************************/
     324             : 
     325         200 : bool OGRParquetWriterLayer::SetOptions(CSLConstList papszOptions,
     326             :                                        const OGRSpatialReference *poSpatialRef,
     327             :                                        OGRwkbGeometryType eGType)
     328             : {
     329         200 :     m_bWriteBBoxStruct = CPLTestBool(CSLFetchNameValueDef(
     330             :         papszOptions, "WRITE_COVERING_BBOX",
     331             :         CPLGetConfigOption("OGR_PARQUET_WRITE_COVERING_BBOX", "YES")));
     332             : 
     333         200 :     if (CPLTestBool(CSLFetchNameValueDef(papszOptions, "SORT_BY_BBOX", "NO")))
     334             :     {
     335           6 :         const std::string osTmpGPKG(std::string(m_poDataset->GetDescription()) +
     336           3 :                                     ".tmp.gpkg");
     337           3 :         auto poGPKGDrv = GetGDALDriverManager()->GetDriverByName("GPKG");
     338           3 :         if (!poGPKGDrv)
     339             :         {
     340           1 :             CPLError(
     341             :                 CE_Failure, CPLE_AppDefined,
     342             :                 "Driver GPKG required for SORT_BY_BBOX layer creation option");
     343           1 :             return false;
     344             :         }
     345           2 :         m_poTmpGPKG.reset(poGPKGDrv->Create(osTmpGPKG.c_str(), 0, 0, 0,
     346             :                                             GDT_Unknown, nullptr));
     347           2 :         if (!m_poTmpGPKG)
     348           0 :             return false;
     349           2 :         m_poTmpGPKG->MarkSuppressOnClose();
     350           2 :         m_poTmpGPKGLayer = m_poTmpGPKG->CreateLayer("tmp");
     351           2 :         if (!m_poTmpGPKGLayer)
     352           0 :             return false;
     353             :         // Serialized feature
     354           2 :         m_poTmpGPKGLayer->CreateField(
     355           2 :             std::make_unique<OGRFieldDefn>("serialized_feature", OFTBinary)
     356           2 :                 .get());
     357           2 :         CPL_IGNORE_RET_VAL(m_poTmpGPKGLayer->StartTransaction());
     358             :     }
     359             : 
     360             :     const char *pszGeomEncoding =
     361         199 :         CSLFetchNameValue(papszOptions, "GEOMETRY_ENCODING");
     362         199 :     m_eGeomEncoding = OGRArrowGeomEncoding::WKB;
     363         199 :     if (pszGeomEncoding)
     364             :     {
     365          92 :         if (EQUAL(pszGeomEncoding, "WKB"))
     366           0 :             m_eGeomEncoding = OGRArrowGeomEncoding::WKB;
     367          92 :         else if (EQUAL(pszGeomEncoding, "WKT"))
     368           8 :             m_eGeomEncoding = OGRArrowGeomEncoding::WKT;
     369          84 :         else if (EQUAL(pszGeomEncoding, "GEOARROW_INTERLEAVED"))
     370             :         {
     371             :             static bool bHasWarned = false;
     372          28 :             if (!bHasWarned)
     373             :             {
     374           1 :                 bHasWarned = true;
     375           1 :                 CPLError(
     376             :                     CE_Warning, CPLE_AppDefined,
     377             :                     "Use of GEOMETRY_ENCODING=GEOARROW_INTERLEAVED is not "
     378             :                     "recommended. "
     379             :                     "GeoParquet 1.1 uses GEOMETRY_ENCODING=GEOARROW (struct) "
     380             :                     "instead.");
     381             :             }
     382          28 :             m_eGeomEncoding = OGRArrowGeomEncoding::GEOARROW_FSL_GENERIC;
     383             :         }
     384          56 :         else if (EQUAL(pszGeomEncoding, "GEOARROW") ||
     385           0 :                  EQUAL(pszGeomEncoding, "GEOARROW_STRUCT"))
     386          56 :             m_eGeomEncoding = OGRArrowGeomEncoding::GEOARROW_STRUCT_GENERIC;
     387             :         else
     388             :         {
     389           0 :             CPLError(CE_Failure, CPLE_NotSupported,
     390             :                      "Unsupported GEOMETRY_ENCODING = %s", pszGeomEncoding);
     391           0 :             return false;
     392             :         }
     393             :     }
     394             : 
     395             :     const char *pszCoordPrecision =
     396         199 :         CSLFetchNameValue(papszOptions, "COORDINATE_PRECISION");
     397         199 :     if (pszCoordPrecision)
     398           0 :         m_nWKTCoordinatePrecision = atoi(pszCoordPrecision);
     399             : 
     400         199 :     m_bForceCounterClockwiseOrientation =
     401         199 :         EQUAL(CSLFetchNameValueDef(papszOptions, "POLYGON_ORIENTATION",
     402             :                                    "COUNTERCLOCKWISE"),
     403             :               "COUNTERCLOCKWISE");
     404             : 
     405         199 :     if (eGType != wkbNone)
     406             :     {
     407         178 :         if (!IsSupportedGeometryType(eGType))
     408             :         {
     409           1 :             return false;
     410             :         }
     411             : 
     412         177 :         m_poFeatureDefn->SetGeomType(eGType);
     413         177 :         auto eGeomEncoding = m_eGeomEncoding;
     414         177 :         if (eGeomEncoding == OGRArrowGeomEncoding::GEOARROW_FSL_GENERIC ||
     415         149 :             eGeomEncoding == OGRArrowGeomEncoding::GEOARROW_STRUCT_GENERIC)
     416             :         {
     417          84 :             const auto eEncodingType = eGeomEncoding;
     418          84 :             eGeomEncoding = GetPreciseArrowGeomEncoding(eEncodingType, eGType);
     419          84 :             if (eGeomEncoding == eEncodingType)
     420           0 :                 return false;
     421             :         }
     422         177 :         m_aeGeomEncoding.push_back(eGeomEncoding);
     423         177 :         m_poFeatureDefn->GetGeomFieldDefn(0)->SetName(
     424             :             CSLFetchNameValueDef(papszOptions, "GEOMETRY_NAME", "geometry"));
     425         177 :         if (poSpatialRef)
     426             :         {
     427          20 :             auto poSRS = poSpatialRef->Clone();
     428          20 :             m_poFeatureDefn->GetGeomFieldDefn(0)->SetSpatialRef(poSRS);
     429          20 :             poSRS->Release();
     430             :         }
     431             :     }
     432             : 
     433         198 :     m_osFIDColumn = CSLFetchNameValueDef(papszOptions, "FID", "");
     434             : 
     435         198 :     const char *pszCompression = CSLFetchNameValue(papszOptions, "COMPRESSION");
     436         198 :     if (pszCompression == nullptr)
     437             :     {
     438         582 :         auto oResult = arrow::util::Codec::GetCompressionType("snappy");
     439         194 :         if (oResult.ok() && arrow::util::Codec::IsAvailable(*oResult))
     440             :         {
     441         194 :             pszCompression = "SNAPPY";
     442             :         }
     443             :         else
     444             :         {
     445           0 :             pszCompression = "NONE";
     446             :         }
     447             :     }
     448             : 
     449         198 :     if (EQUAL(pszCompression, "NONE"))
     450           0 :         pszCompression = "UNCOMPRESSED";
     451             :     auto oResult = arrow::util::Codec::GetCompressionType(
     452         396 :         CPLString(pszCompression).tolower());
     453         198 :     if (!oResult.ok())
     454             :     {
     455           1 :         CPLError(CE_Failure, CPLE_NotSupported,
     456             :                  "Unrecognized compression method: %s", pszCompression);
     457           1 :         return false;
     458             :     }
     459         197 :     m_eCompression = *oResult;
     460         197 :     if (!arrow::util::Codec::IsAvailable(m_eCompression))
     461             :     {
     462           0 :         CPLError(CE_Failure, CPLE_NotSupported,
     463             :                  "Compression method %s is known, but libarrow has not "
     464             :                  "been built with support for it",
     465             :                  pszCompression);
     466           0 :         return false;
     467             :     }
     468             : 
     469         197 :     m_oWriterPropertiesBuilder.compression(m_eCompression);
     470             :     const std::string osCreator =
     471         197 :         CSLFetchNameValueDef(papszOptions, "CREATOR", "");
     472         197 :     if (!osCreator.empty())
     473           1 :         m_oWriterPropertiesBuilder.created_by(osCreator);
     474             :     else
     475         196 :         m_oWriterPropertiesBuilder.created_by("GDAL " GDAL_RELEASE_NAME
     476             :                                               ", using " CREATED_BY_VERSION);
     477             : 
     478             :     // Undocumented option. Not clear it is useful besides unit test purposes
     479         197 :     if (!CPLTestBool(CSLFetchNameValueDef(papszOptions, "STATISTICS", "YES")))
     480           1 :         m_oWriterPropertiesBuilder.disable_statistics();
     481             : 
     482         197 :     if (m_eGeomEncoding == OGRArrowGeomEncoding::WKB && eGType != wkbNone)
     483             :     {
     484          92 :         m_oWriterPropertiesBuilder.disable_statistics(
     485         276 :             parquet::schema::ColumnPath::FromDotString(
     486          92 :                 m_poFeatureDefn->GetGeomFieldDefn(0)->GetNameRef()));
     487             :     }
     488             : 
     489             :     const char *pszRowGroupSize =
     490         197 :         CSLFetchNameValue(papszOptions, "ROW_GROUP_SIZE");
     491         197 :     if (pszRowGroupSize)
     492             :     {
     493           5 :         auto nRowGroupSize = static_cast<int64_t>(atoll(pszRowGroupSize));
     494           5 :         if (nRowGroupSize > 0)
     495             :         {
     496           5 :             if (nRowGroupSize > INT_MAX)
     497           0 :                 nRowGroupSize = INT_MAX;
     498           5 :             m_nRowGroupSize = nRowGroupSize;
     499             :         }
     500             :     }
     501             : 
     502         197 :     m_bEdgesSpherical = EQUAL(
     503             :         CSLFetchNameValueDef(papszOptions, "EDGES", "PLANAR"), "SPHERICAL");
     504             : 
     505         197 :     m_bInitializationOK = true;
     506         197 :     return true;
     507             : }
     508             : 
     509             : /************************************************************************/
     510             : /*                         CloseFileWriter()                            */
     511             : /************************************************************************/
     512             : 
     513         197 : bool OGRParquetWriterLayer::CloseFileWriter()
     514             : {
     515         394 :     auto status = m_poFileWriter->Close();
     516         197 :     if (!status.ok())
     517             :     {
     518           0 :         CPLError(CE_Failure, CPLE_AppDefined,
     519             :                  "FileWriter::Close() failed with %s",
     520           0 :                  status.message().c_str());
     521             :     }
     522         394 :     return status.ok();
     523             : }
     524             : 
     525             : /************************************************************************/
     526             : /*                            IdentifyCRS()                             */
     527             : /************************************************************************/
     528             : 
     529          19 : static OGRSpatialReference IdentifyCRS(const OGRSpatialReference *poSRS)
     530             : {
     531          19 :     OGRSpatialReference oSRSIdentified(*poSRS);
     532             : 
     533          19 :     if (poSRS->GetAuthorityName(nullptr) == nullptr)
     534             :     {
     535             :         // Try to find a registered CRS that matches the input one
     536           4 :         int nEntries = 0;
     537           4 :         int *panConfidence = nullptr;
     538             :         OGRSpatialReferenceH *pahSRS =
     539           4 :             poSRS->FindMatches(nullptr, &nEntries, &panConfidence);
     540             : 
     541             :         // If there are several matches >= 90%, take the only one
     542             :         // that is EPSG
     543           4 :         int iOtherAuthority = -1;
     544           4 :         int iEPSG = -1;
     545           4 :         const char *const apszOptions[] = {
     546             :             "IGNORE_DATA_AXIS_TO_SRS_AXIS_MAPPING=YES", nullptr};
     547           4 :         int iConfidenceBestMatch = -1;
     548           6 :         for (int iSRS = 0; iSRS < nEntries; iSRS++)
     549             :         {
     550           4 :             auto poCandidateCRS = OGRSpatialReference::FromHandle(pahSRS[iSRS]);
     551           4 :             if (panConfidence[iSRS] < iConfidenceBestMatch ||
     552           4 :                 panConfidence[iSRS] < 70)
     553             :             {
     554             :                 break;
     555             :             }
     556           3 :             if (poSRS->IsSame(poCandidateCRS, apszOptions))
     557             :             {
     558             :                 const char *pszAuthName =
     559           3 :                     poCandidateCRS->GetAuthorityName(nullptr);
     560           3 :                 if (pszAuthName != nullptr && EQUAL(pszAuthName, "EPSG"))
     561             :                 {
     562           2 :                     iOtherAuthority = -2;
     563           2 :                     if (iEPSG < 0)
     564             :                     {
     565           2 :                         iConfidenceBestMatch = panConfidence[iSRS];
     566           2 :                         iEPSG = iSRS;
     567             :                     }
     568             :                     else
     569             :                     {
     570           0 :                         iEPSG = -1;
     571           0 :                         break;
     572             :                     }
     573             :                 }
     574           1 :                 else if (iEPSG < 0 && pszAuthName != nullptr)
     575             :                 {
     576           1 :                     if (EQUAL(pszAuthName, "OGC"))
     577             :                     {
     578             :                         const char *pszAuthCode =
     579           1 :                             poCandidateCRS->GetAuthorityCode(nullptr);
     580           1 :                         if (pszAuthCode && EQUAL(pszAuthCode, "CRS84"))
     581             :                         {
     582           1 :                             iOtherAuthority = iSRS;
     583           1 :                             break;
     584             :                         }
     585             :                     }
     586           0 :                     else if (iOtherAuthority == -1)
     587             :                     {
     588           0 :                         iConfidenceBestMatch = panConfidence[iSRS];
     589           0 :                         iOtherAuthority = iSRS;
     590             :                     }
     591             :                     else
     592           0 :                         iOtherAuthority = -2;
     593             :                 }
     594             :             }
     595             :         }
     596           4 :         if (iEPSG >= 0)
     597             :         {
     598           2 :             oSRSIdentified = *OGRSpatialReference::FromHandle(pahSRS[iEPSG]);
     599             :         }
     600           2 :         else if (iOtherAuthority >= 0)
     601             :         {
     602             :             oSRSIdentified =
     603           1 :                 *OGRSpatialReference::FromHandle(pahSRS[iOtherAuthority]);
     604             :         }
     605           4 :         OSRFreeSRSArray(pahSRS);
     606           4 :         CPLFree(panConfidence);
     607             :     }
     608             : 
     609          19 :     return oSRSIdentified;
     610             : }
     611             : 
     612             : /************************************************************************/
     613             : /*                      RemoveIDFromMemberOfEnsembles()                 */
     614             : /************************************************************************/
     615             : 
     616         236 : static void RemoveIDFromMemberOfEnsembles(CPLJSONObject &obj)
     617             : {
     618             :     // Remove "id" from members of datum ensembles for compatibility with
     619             :     // older PROJ versions
     620             :     // Cf https://github.com/opengeospatial/geoparquet/discussions/110
     621             :     // and https://github.com/OSGeo/PROJ/pull/3221
     622         236 :     if (obj.GetType() == CPLJSONObject::Type::Object)
     623             :     {
     624         298 :         for (auto &subObj : obj.GetChildren())
     625             :         {
     626         228 :             RemoveIDFromMemberOfEnsembles(subObj);
     627             :         }
     628             :     }
     629         182 :     else if (obj.GetType() == CPLJSONObject::Type::Array &&
     630         182 :              obj.GetName() == "members")
     631             :     {
     632           0 :         for (auto &subObj : obj.ToArray())
     633             :         {
     634           0 :             subObj.Delete("id");
     635             :         }
     636             :     }
     637         236 : }
     638             : 
     639             : /************************************************************************/
     640             : /*                            GetGeoMetadata()                          */
     641             : /************************************************************************/
     642             : 
     643         197 : std::string OGRParquetWriterLayer::GetGeoMetadata() const
     644             : {
     645             :     // Just for unit testing purposes
     646             :     const char *pszGeoMetadata =
     647         197 :         CPLGetConfigOption("OGR_PARQUET_GEO_METADATA", nullptr);
     648         197 :     if (pszGeoMetadata)
     649          16 :         return pszGeoMetadata;
     650             : 
     651         351 :     if (m_poFeatureDefn->GetGeomFieldCount() != 0 &&
     652         170 :         CPLTestBool(CPLGetConfigOption("OGR_PARQUET_WRITE_GEO", "YES")))
     653             :     {
     654         338 :         CPLJSONObject oRoot;
     655         169 :         oRoot.Add("version",
     656         169 :                   m_eGeomEncoding ==
     657             :                           OGRArrowGeomEncoding::GEOARROW_STRUCT_GENERIC
     658             :                       ? "1.1.0"
     659             :                       : "1.0.0");
     660         169 :         oRoot.Add("primary_column",
     661         169 :                   m_poFeatureDefn->GetGeomFieldDefn(0)->GetNameRef());
     662         338 :         CPLJSONObject oColumns;
     663         169 :         oRoot.Add("columns", oColumns);
     664         355 :         for (int i = 0; i < m_poFeatureDefn->GetGeomFieldCount(); ++i)
     665             :         {
     666         186 :             const auto poGeomFieldDefn = m_poFeatureDefn->GetGeomFieldDefn(i);
     667         372 :             CPLJSONObject oColumn;
     668         186 :             oColumns.Add(poGeomFieldDefn->GetNameRef(), oColumn);
     669         186 :             oColumn.Add("encoding",
     670         186 :                         GetGeomEncodingAsString(m_aeGeomEncoding[i], true));
     671             : 
     672         186 :             if (CPLTestBool(CPLGetConfigOption("OGR_PARQUET_WRITE_CRS", "YES")))
     673             :             {
     674         185 :                 const auto poSRS = poGeomFieldDefn->GetSpatialRef();
     675         185 :                 if (poSRS)
     676             :                 {
     677          38 :                     OGRSpatialReference oSRSIdentified(IdentifyCRS(poSRS));
     678             : 
     679             :                     const char *pszAuthName =
     680          19 :                         oSRSIdentified.GetAuthorityName(nullptr);
     681             :                     const char *pszAuthCode =
     682          19 :                         oSRSIdentified.GetAuthorityCode(nullptr);
     683             : 
     684          19 :                     bool bOmitCRS = false;
     685          19 :                     if (pszAuthName != nullptr && pszAuthCode != nullptr &&
     686          18 :                         ((EQUAL(pszAuthName, "EPSG") &&
     687          15 :                           EQUAL(pszAuthCode, "4326")) ||
     688          10 :                          (EQUAL(pszAuthName, "OGC") &&
     689           3 :                           EQUAL(pszAuthCode, "CRS84"))))
     690             :                     {
     691             :                         // To make things less confusing for non-geo-aware
     692             :                         // consumers, omit EPSG:4326 / OGC:CRS84 CRS by default
     693          11 :                         bOmitCRS = CPLTestBool(CPLGetConfigOption(
     694             :                             "OGR_PARQUET_CRS_OMIT_IF_WGS84", "YES"));
     695             :                     }
     696             : 
     697          19 :                     if (bOmitCRS)
     698             :                     {
     699             :                         // do nothing
     700             :                     }
     701           8 :                     else if (EQUAL(CPLGetConfigOption(
     702             :                                        "OGR_PARQUET_CRS_ENCODING", "PROJJSON"),
     703             :                                    "PROJJSON"))
     704             :                     {
     705             :                         // CRS encoded as PROJJSON for GeoParquet >= 0.4.0
     706           8 :                         char *pszPROJJSON = nullptr;
     707           8 :                         oSRSIdentified.exportToPROJJSON(&pszPROJJSON, nullptr);
     708          16 :                         CPLJSONDocument oCRSDoc;
     709           8 :                         CPL_IGNORE_RET_VAL(oCRSDoc.LoadMemory(pszPROJJSON));
     710           8 :                         CPLFree(pszPROJJSON);
     711           8 :                         CPLJSONObject oCRSRoot = oCRSDoc.GetRoot();
     712           8 :                         RemoveIDFromMemberOfEnsembles(oCRSRoot);
     713           8 :                         oColumn.Add("crs", oCRSRoot);
     714             :                     }
     715             :                     else
     716             :                     {
     717             :                         // WKT was used in GeoParquet <= 0.3.0
     718           0 :                         const char *const apszOptions[] = {
     719             :                             "FORMAT=WKT2_2019", "MULTILINE=NO", nullptr};
     720           0 :                         char *pszWKT = nullptr;
     721           0 :                         oSRSIdentified.exportToWkt(&pszWKT, apszOptions);
     722           0 :                         if (pszWKT)
     723           0 :                             oColumn.Add("crs", pszWKT);
     724           0 :                         CPLFree(pszWKT);
     725             :                     }
     726             : 
     727          19 :                     const double dfCoordEpoch = poSRS->GetCoordinateEpoch();
     728          19 :                     if (dfCoordEpoch > 0)
     729           2 :                         oColumn.Add("epoch", dfCoordEpoch);
     730             :                 }
     731             :                 else
     732             :                 {
     733         166 :                     oColumn.AddNull("crs");
     734             :                 }
     735             :             }
     736             : 
     737         186 :             if (m_bEdgesSpherical)
     738             :             {
     739           1 :                 oColumn.Add("edges", "spherical");
     740             :             }
     741             : 
     742         348 :             if (m_aoEnvelopes[i].IsInit() &&
     743         162 :                 CPLTestBool(
     744             :                     CPLGetConfigOption("OGR_PARQUET_WRITE_BBOX", "YES")))
     745             :             {
     746         162 :                 bool bHasZ = false;
     747         307 :                 for (const auto eGeomType : m_oSetWrittenGeometryTypes[i])
     748             :                 {
     749         204 :                     bHasZ = OGR_GT_HasZ(eGeomType);
     750         204 :                     if (bHasZ)
     751          59 :                         break;
     752             :                 }
     753         162 :                 CPLJSONArray oBBOX;
     754         162 :                 oBBOX.Add(m_aoEnvelopes[i].MinX);
     755         162 :                 oBBOX.Add(m_aoEnvelopes[i].MinY);
     756         162 :                 if (bHasZ)
     757          59 :                     oBBOX.Add(m_aoEnvelopes[i].MinZ);
     758         162 :                 oBBOX.Add(m_aoEnvelopes[i].MaxX);
     759         162 :                 oBBOX.Add(m_aoEnvelopes[i].MaxY);
     760         162 :                 if (bHasZ)
     761          59 :                     oBBOX.Add(m_aoEnvelopes[i].MaxZ);
     762         162 :                 oColumn.Add("bbox", oBBOX);
     763             :             }
     764             : 
     765             :             // Bounding box column definition
     766         330 :             if (m_bWriteBBoxStruct &&
     767         144 :                 CPLTestBool(CPLGetConfigOption(
     768             :                     "OGR_PARQUET_WRITE_COVERING_BBOX_IN_METADATA", "YES")))
     769             :             {
     770         288 :                 CPLJSONObject oCovering;
     771         144 :                 oColumn.Add("covering", oCovering);
     772         288 :                 CPLJSONObject oBBOX;
     773         144 :                 oCovering.Add("bbox", oBBOX);
     774             :                 const auto AddComponent =
     775        1728 :                     [this, i, &oBBOX](const char *pszComponent)
     776             :                 {
     777         576 :                     CPLJSONArray oArray;
     778         576 :                     oArray.Add(m_apoFieldsBBOX[i]->name());
     779         576 :                     oArray.Add(pszComponent);
     780         576 :                     oBBOX.Add(pszComponent, oArray);
     781         576 :                 };
     782         144 :                 AddComponent("xmin");
     783         144 :                 AddComponent("ymin");
     784         144 :                 AddComponent("xmax");
     785         144 :                 AddComponent("ymax");
     786             :             }
     787             : 
     788         220 :             const auto GetStringGeometryType = [](OGRwkbGeometryType eType)
     789             :             {
     790         220 :                 const auto eFlattenType = wkbFlatten(eType);
     791         220 :                 std::string osType = "Unknown";
     792         220 :                 if (wkbPoint == eFlattenType)
     793          53 :                     osType = "Point";
     794         167 :                 else if (wkbLineString == eFlattenType)
     795          26 :                     osType = "LineString";
     796         141 :                 else if (wkbPolygon == eFlattenType)
     797          40 :                     osType = "Polygon";
     798         101 :                 else if (wkbMultiPoint == eFlattenType)
     799          18 :                     osType = "MultiPoint";
     800          83 :                 else if (wkbMultiLineString == eFlattenType)
     801          21 :                     osType = "MultiLineString";
     802          62 :                 else if (wkbMultiPolygon == eFlattenType)
     803          57 :                     osType = "MultiPolygon";
     804           5 :                 else if (wkbGeometryCollection == eFlattenType)
     805           5 :                     osType = "GeometryCollection";
     806         220 :                 if (osType != "Unknown")
     807             :                 {
     808             :                     // M and ZM not supported officially currently, but it
     809             :                     // doesn't hurt to anticipate
     810         220 :                     if (OGR_GT_HasZ(eType) && OGR_GT_HasM(eType))
     811           8 :                         osType += " ZM";
     812         212 :                     else if (OGR_GT_HasZ(eType))
     813          67 :                         osType += " Z";
     814         145 :                     else if (OGR_GT_HasM(eType))
     815           8 :                         osType += " M";
     816             :                 }
     817         220 :                 return osType;
     818             :             };
     819             : 
     820         186 :             if (m_bForceCounterClockwiseOrientation)
     821         185 :                 oColumn.Add("orientation", "counterclockwise");
     822             : 
     823         186 :             CPLJSONArray oArray;
     824         406 :             for (const auto eType : m_oSetWrittenGeometryTypes[i])
     825             :             {
     826         220 :                 oArray.Add(GetStringGeometryType(eType));
     827             :             }
     828         186 :             oColumn.Add("geometry_types", oArray);
     829             :         }
     830             : 
     831         169 :         return oRoot.Format(CPLJSONObject::PrettyFormat::Plain);
     832             :     }
     833          12 :     return std::string();
     834             : }
     835             : 
     836             : /************************************************************************/
     837             : /*               PerformStepsBeforeFinalFlushGroup()                    */
     838             : /************************************************************************/
     839             : 
     840         197 : void OGRParquetWriterLayer::PerformStepsBeforeFinalFlushGroup()
     841             : {
     842         197 :     if (m_poKeyValueMetadata)
     843             :     {
     844         394 :         const std::string osGeoMetadata = GetGeoMetadata();
     845         394 :         auto poTmpSchema = m_poSchema;
     846         197 :         if (!osGeoMetadata.empty())
     847             :         {
     848             :             // HACK: it would be good for Arrow to provide a clean way to alter
     849             :             // key value metadata before finalizing.
     850             :             // We need to write metadata at end to write the bounding box.
     851         185 :             const_cast<arrow::KeyValueMetadata *>(m_poKeyValueMetadata.get())
     852         185 :                 ->Append("geo", osGeoMetadata);
     853             : 
     854         185 :             auto kvMetadata = poTmpSchema->metadata()
     855           8 :                                   ? poTmpSchema->metadata()->Copy()
     856         193 :                                   : std::make_shared<arrow::KeyValueMetadata>();
     857         185 :             kvMetadata->Append("geo", osGeoMetadata);
     858         185 :             poTmpSchema = poTmpSchema->WithMetadata(kvMetadata);
     859             :         }
     860             : 
     861         197 :         if (CPLTestBool(
     862             :                 CPLGetConfigOption("OGR_PARQUET_WRITE_ARROW_SCHEMA", "YES")))
     863             :         {
     864             :             auto status =
     865         394 :                 ::arrow::ipc::SerializeSchema(*poTmpSchema, m_poMemoryPool);
     866         197 :             if (status.ok())
     867             :             {
     868             :                 // The serialized schema is not UTF-8, which is required for
     869             :                 // Thrift
     870         394 :                 const std::string schema_as_string = (*status)->ToString();
     871             :                 const std::string schema_base64 =
     872         197 :                     ::arrow::util::base64_encode(schema_as_string);
     873         197 :                 static const std::string kArrowSchemaKey = "ARROW:schema";
     874             :                 const_cast<arrow::KeyValueMetadata *>(
     875         197 :                     m_poKeyValueMetadata.get())
     876         197 :                     ->Append(kArrowSchemaKey, schema_base64);
     877             :             }
     878             :         }
     879             : 
     880             :         // Put GDAL metadata into a gdal:metadata domain
     881         394 :         CPLJSONObject oMultiMetadata;
     882         197 :         bool bHasMultiMetadata = false;
     883         200 :         auto &l_oMDMD = oMDMD.GetDomainList() && *(oMDMD.GetDomainList())
     884         200 :                             ? oMDMD
     885         194 :                             : m_poDataset->GetMultiDomainMetadata();
     886         202 :         for (CSLConstList papszDomainIter = l_oMDMD.GetDomainList();
     887         202 :              papszDomainIter && *papszDomainIter; ++papszDomainIter)
     888             :         {
     889           5 :             const char *pszDomain = *papszDomainIter;
     890           5 :             CSLConstList papszMD = l_oMDMD.GetMetadata(pszDomain);
     891           5 :             if (STARTS_WITH(pszDomain, "json:") && papszMD && papszMD[0])
     892             :             {
     893           1 :                 CPLJSONDocument oDoc;
     894           1 :                 if (oDoc.LoadMemory(papszMD[0]))
     895             :                 {
     896           1 :                     bHasMultiMetadata = true;
     897           1 :                     oMultiMetadata.Add(pszDomain, oDoc.GetRoot());
     898           1 :                     continue;
     899           0 :                 }
     900             :             }
     901           4 :             else if (STARTS_WITH(pszDomain, "xml:") && papszMD && papszMD[0])
     902             :             {
     903           1 :                 bHasMultiMetadata = true;
     904           1 :                 oMultiMetadata.Add(pszDomain, papszMD[0]);
     905           1 :                 continue;
     906             :             }
     907           6 :             CPLJSONObject oMetadata;
     908           3 :             bool bHasMetadata = false;
     909           6 :             for (CSLConstList papszMDIter = papszMD;
     910           6 :                  papszMDIter && *papszMDIter; ++papszMDIter)
     911             :             {
     912           3 :                 char *pszKey = nullptr;
     913           3 :                 const char *pszValue = CPLParseNameValue(*papszMDIter, &pszKey);
     914           3 :                 if (pszKey && pszValue)
     915             :                 {
     916           3 :                     bHasMetadata = true;
     917           3 :                     bHasMultiMetadata = true;
     918           3 :                     oMetadata.Add(pszKey, pszValue);
     919             :                 }
     920           3 :                 CPLFree(pszKey);
     921             :             }
     922           3 :             if (bHasMetadata)
     923           3 :                 oMultiMetadata.Add(pszDomain, oMetadata);
     924             :         }
     925         197 :         if (bHasMultiMetadata)
     926             :         {
     927           3 :             const_cast<arrow::KeyValueMetadata *>(m_poKeyValueMetadata.get())
     928           3 :                 ->Append(
     929             :                     "gdal:metadata",
     930           6 :                     oMultiMetadata.Format(CPLJSONObject::PrettyFormat::Plain));
     931             :         }
     932             :     }
     933         197 : }
     934             : 
     935             : /************************************************************************/
     936             : /*                                 Open()                               */
     937             : /************************************************************************/
     938             : 
     939             : // Same as parquet::arrow::FileWriter::Open(), except we also
     940             : // return KeyValueMetadata
     941             : static arrow::Status
     942         197 : Open(const ::arrow::Schema &schema, ::arrow::MemoryPool *pool,
     943             :      std::shared_ptr<::arrow::io::OutputStream> sink,
     944             :      std::shared_ptr<parquet::WriterProperties> properties,
     945             :      std::shared_ptr<parquet::ArrowWriterProperties> arrow_properties,
     946             :      std::unique_ptr<parquet::arrow::FileWriter> *writer,
     947             :      std::shared_ptr<const arrow::KeyValueMetadata> *outMetadata)
     948             : {
     949         197 :     std::shared_ptr<parquet::SchemaDescriptor> parquet_schema;
     950         394 :     RETURN_NOT_OK(parquet::arrow::ToParquetSchema(
     951             :         &schema, *properties, *arrow_properties, &parquet_schema));
     952             : 
     953             :     auto schema_node = std::static_pointer_cast<parquet::schema::GroupNode>(
     954         394 :         parquet_schema->schema_root());
     955             : 
     956         197 :     auto metadata = schema.metadata()
     957          13 :                         ? schema.metadata()->Copy()
     958         407 :                         : std::make_shared<arrow::KeyValueMetadata>();
     959         197 :     *outMetadata = metadata;
     960             : 
     961         197 :     std::unique_ptr<parquet::ParquetFileWriter> base_writer;
     962         197 :     PARQUET_CATCH_NOT_OK(base_writer = parquet::ParquetFileWriter::Open(
     963             :                              std::move(sink), std::move(schema_node),
     964             :                              std::move(properties), metadata));
     965             : 
     966         197 :     auto schema_ptr = std::make_shared<::arrow::Schema>(schema);
     967             :     return parquet::arrow::FileWriter::Make(
     968         394 :         pool, std::move(base_writer), std::move(schema_ptr),
     969         591 :         std::move(arrow_properties), writer);
     970             : }
     971             : 
     972             : /************************************************************************/
     973             : /*                          CreateSchema()                              */
     974             : /************************************************************************/
     975             : 
     976         197 : void OGRParquetWriterLayer::CreateSchema()
     977             : {
     978         197 :     CreateSchemaCommon();
     979         197 : }
     980             : 
     981             : /************************************************************************/
     982             : /*                          CreateGeomField()                           */
     983             : /************************************************************************/
     984             : 
     985          27 : OGRErr OGRParquetWriterLayer::CreateGeomField(const OGRGeomFieldDefn *poField,
     986             :                                               int bApproxOK)
     987             : {
     988          27 :     OGRErr eErr = OGRArrowWriterLayer::CreateGeomField(poField, bApproxOK);
     989          53 :     if (eErr == OGRERR_NONE &&
     990          26 :         m_aeGeomEncoding.back() == OGRArrowGeomEncoding::WKB)
     991             :     {
     992           2 :         m_oWriterPropertiesBuilder.disable_statistics(
     993           6 :             parquet::schema::ColumnPath::FromDotString(
     994           2 :                 m_poFeatureDefn
     995           2 :                     ->GetGeomFieldDefn(m_poFeatureDefn->GetGeomFieldCount() - 1)
     996             :                     ->GetNameRef()));
     997             :     }
     998          27 :     return eErr;
     999             : }
    1000             : 
    1001             : /************************************************************************/
    1002             : /*                          CreateWriter()                              */
    1003             : /************************************************************************/
    1004             : 
    1005         197 : void OGRParquetWriterLayer::CreateWriter()
    1006             : {
    1007         197 :     CPLAssert(m_poFileWriter == nullptr);
    1008             : 
    1009         197 :     if (m_poSchema == nullptr)
    1010             :     {
    1011          38 :         CreateSchema();
    1012             :     }
    1013             :     else
    1014             :     {
    1015         159 :         FinalizeSchema();
    1016             :     }
    1017             : 
    1018             :     auto arrowWriterProperties =
    1019         197 :         parquet::ArrowWriterProperties::Builder().store_schema()->build();
    1020         591 :     CPL_IGNORE_RET_VAL(Open(*m_poSchema, m_poMemoryPool, m_poOutputStream,
    1021         394 :                             m_oWriterPropertiesBuilder.build(),
    1022         197 :                             std::move(arrowWriterProperties), &m_poFileWriter,
    1023             :                             &m_poKeyValueMetadata));
    1024         197 : }
    1025             : 
    1026             : /************************************************************************/
    1027             : /*                          ICreateFeature()                            */
    1028             : /************************************************************************/
    1029             : 
    1030        2806 : OGRErr OGRParquetWriterLayer::ICreateFeature(OGRFeature *poFeature)  // Writes poFeature directly, or stages it in the temporary GeoPackage layer when one is set; returns an OGRErr status.
    1031             : {
    1032             :     // If not using SORT_BY_BBOX=YES layer creation option, we can directly
    1033             :     // write features to the final Parquet file
    1034        2806 :     if (!m_poTmpGPKGLayer)
    1035         602 :         return OGRArrowWriterLayer::ICreateFeature(poFeature);
    1036             : 
    1037             :     // SORT_BY_BBOX=YES case: we write for now a serialized version of poFeature
    1038             :     // in a temporary GeoPackage file.
    1039             : 
    1040        2204 :     GIntBig nFID = poFeature->GetFID();
    1041        2204 :     if (!m_osFIDColumn.empty() && nFID == OGRNullFID)  // A FID column name is set but the feature has no FID: assign the running count of staged features.
    1042             :     {
    1043        1102 :         nFID = m_nTmpFeatureCount;
    1044        1102 :         poFeature->SetFID(nFID);  // The caller's feature is modified too, so the serialized copy below carries this FID.
    1045             :     }
    1046        2204 :     ++m_nTmpFeatureCount;  // Incremented for every staged feature, whether or not its FID was assigned above.
    1047             : 
    1048        4408 :     std::vector<GByte> abyBuffer;
    1049             :     // Serialize the source feature as a single array of bytes to preserve it
    1050             :     // fully
    1051        2204 :     if (!poFeature->SerializeToBinary(abyBuffer))
    1052             :     {
    1053           0 :         return OGRERR_FAILURE;
    1054             :     }
    1055             : 
    1056             :     // SQLite3 limitation: a row must fit in slightly less than 1 GB.
    1057        2204 :     constexpr int SOME_MARGIN = 128;
    1058        2204 :     if (abyBuffer.size() > 1024 * 1024 * 1024 - SOME_MARGIN)  // This bound also keeps the size within range of the int cast used in SetField() below.
    1059             :     {
    1060           0 :         CPLError(CE_Failure, CPLE_NotSupported,
    1061             :                  "Features larger than 1 GB are not supported");
    1062           0 :         return OGRERR_FAILURE;
    1063             :     }
    1064             : 
    1065        4408 :     OGRFeature oFeat(m_poTmpGPKGLayer->GetLayerDefn());  // Feature for the temporary layer; its field 0 receives the serialized bytes.
    1066        2204 :     oFeat.SetFID(nFID);
    1067        2204 :     oFeat.SetField(0, static_cast<int>(abyBuffer.size()), abyBuffer.data());
    1068        2204 :     const auto poSrcGeom = poFeature->GetGeometryRef();
    1069        2204 :     if (poSrcGeom && !poSrcGeom->IsEmpty())  // Null or empty source geometries are staged without any geometry.
    1070             :     {
    1071             :         // For the purpose of building an RTree, just use the bounding box of
    1072             :         // the geometry as the geometry.
    1073        1202 :         OGREnvelope sEnvelope;
    1074        1202 :         poSrcGeom->getEnvelope(&sEnvelope);
    1075        2404 :         auto poPoly = std::make_unique<OGRPolygon>();
    1076        2404 :         auto poLR = std::make_unique<OGRLinearRing>();
    1077        1202 :         poLR->addPoint(sEnvelope.MinX, sEnvelope.MinY);  // Five points: the four envelope corners, then the first corner again to close the ring.
    1078        1202 :         poLR->addPoint(sEnvelope.MinX, sEnvelope.MaxY);
    1079        1202 :         poLR->addPoint(sEnvelope.MaxX, sEnvelope.MaxY);
    1080        1202 :         poLR->addPoint(sEnvelope.MaxX, sEnvelope.MinY);
    1081        1202 :         poLR->addPoint(sEnvelope.MinX, sEnvelope.MinY);
    1082        1202 :         poPoly->addRingDirectly(poLR.release());  // release(): the unique_ptr gives up the ring, handing the raw pointer to the polygon.
    1083        1202 :         oFeat.SetGeometryDirectly(poPoly.release());  // Likewise, the polygon pointer is handed over to oFeat.
    1084             :     }
    1085        2204 :     return m_poTmpGPKGLayer->CreateFeature(&oFeat);  // The temporary layer's status is returned to the caller.
    1086             : }
    1087             : 
    1088             : /************************************************************************/
    1089             : /*                            FlushGroup()                              */
    1090             : /************************************************************************/
    1091             : 
    1092         186 : bool OGRParquetWriterLayer::FlushGroup()  // Starts a new row group and writes the pending arrays into it; returns false if either step fails.
    1093             : {
    1094         372 :     auto status = m_poFileWriter->NewRowGroup(m_apoBuilders[0]->length());  // Row count comes from the first builder. NOTE(review): assumes m_apoBuilders is non-empty -- confirm.
    1095         186 :     if (!status.ok())
    1096             :     {
    1097           0 :         CPLError(CE_Failure, CPLE_AppDefined, "NewRowGroup() failed with %s",
    1098           0 :                  status.message().c_str());
    1099           0 :         ClearArrayBuilers();  // Builders are cleared on this failure path as well.
    1100           0 :         return false;
    1101             :     }
    1102             : 
    1103         186 :     auto ret = WriteArrays(  // The callback below writes one array as a column chunk; presumably invoked once per field -- confirm against WriteArrays().
    1104         845 :         [this](const std::shared_ptr<arrow::Field> &field,
    1105         845 :                const std::shared_ptr<arrow::Array> &array)
    1106             :         {
    1107        1690 :             auto l_status = m_poFileWriter->WriteColumnChunk(*array);
    1108         845 :             if (!l_status.ok())
    1109             :             {
    1110           0 :                 CPLError(CE_Failure, CPLE_AppDefined,
    1111             :                          "WriteColumnChunk() failed for field %s: %s",
    1112           0 :                          field->name().c_str(), l_status.message().c_str());
    1113           0 :                 return false;
    1114             :             }
    1115         845 :             return true;
    1116             :         });
    1117             : 
    1118         186 :     ClearArrayBuilers();  // Called whether or not WriteArrays() succeeded.
    1119         186 :     return ret;
    1120             : }
    1121             : 
    1122             : /************************************************************************/
    1123             : /*                    FixupWKBGeometryBeforeWriting()                   */
    1124             : /************************************************************************/
    1125             : 
    1126          25 : void OGRParquetWriterLayer::FixupWKBGeometryBeforeWriting(GByte *pabyWkb,  // Passes the WKB buffer (pabyWkb, nLen bytes) to the ring-orientation fixup helper.
    1127             :                                                           size_t nLen)
    1128             : {
    1129          25 :     if (!m_bForceCounterClockwiseOrientation)  // Nothing to do unless counter-clockwise orientation is being forced.
    1130           0 :         return;
    1131             : 
    1132          25 :     OGRWKBFixupCounterClockWiseExternalRing(pabyWkb, nLen);  // Presumably rewrites the buffer in place (non-const pointer, no return value) -- confirm.
    1133             : }
    1134             : 
    1135             : /************************************************************************/
    1136             : /*                     FixupGeometryBeforeWriting()                     */
    1137             : /************************************************************************/
    1138             : 
    1139        1329 : void OGRParquetWriterLayer::FixupGeometryBeforeWriting(OGRGeometry *poGeom)  // Reorients the polygon rings of poGeom, recursing into multipolygons and geometry collections.
    1140             : {
    1141        1329 :     if (!m_bForceCounterClockwiseOrientation)  // Nothing to do unless counter-clockwise orientation is being forced.
    1142           3 :         return;
    1143             : 
    1144        1326 :     const auto eFlattenType = wkbFlatten(poGeom->getGeometryType());
    1145             :     // Polygon rings MUST follow the right-hand rule for orientation
    1146             :     // (counterclockwise external rings, clockwise internal rings)
    1147        1326 :     if (eFlattenType == wkbPolygon)
    1148             :     {
    1149          44 :         bool bFirstRing = true;  // The first ring iterated is treated as the external ring, all later ones as internal.
    1150          91 :         for (auto poRing : poGeom->toPolygon())
    1151             :         {
    1152          55 :             if ((bFirstRing && poRing->isClockwise()) ||  // Reverse a clockwise first ring, or a later ring that is not clockwise.
    1153           8 :                 (!bFirstRing && !poRing->isClockwise()))
    1154             :             {
    1155          42 :                 poRing->reverseWindingOrder();
    1156             :             }
    1157          47 :             bFirstRing = false;
    1158             :         }
    1159             :     }
    1160        1282 :     else if (eFlattenType == wkbMultiPolygon ||  // Container types: apply the same fixup to each member; any other geometry type is left untouched.
    1161             :              eFlattenType == wkbGeometryCollection)
    1162             :     {
    1163          35 :         for (auto poSubGeom : poGeom->toGeometryCollection())
    1164             :         {
    1165          21 :             FixupGeometryBeforeWriting(poSubGeom);
    1166             :         }
    1167             :     }
    1168             : }
    1169             : 
    1170             : /************************************************************************/
    1171             : /*                          WriteArrowBatch()                           */
    1172             : /************************************************************************/
    1173             : 
    1174             : #if PARQUET_VERSION_MAJOR > 10
    1175             : inline bool
    1176           9 : OGRParquetWriterLayer::WriteArrowBatch(const struct ArrowSchema *schema,  // Writes an Arrow batch: through OGRLayer when a temporary GeoPackage layer is set, otherwise as a buffered row group.
    1177             :                                        struct ArrowArray *array,
    1178             :                                        CSLConstList papszOptions)
    1179             : {
    1180           9 :     if (m_poTmpGPKGLayer)
    1181             :     {
    1182             :         // When using SORT_BY_BBOX=YES option, we can't directly write the
    1183             :         // input array, because we need to sort features. Hence we fallback
    1184             :         // to the OGRLayer base implementation, which will ultimately call
    1185             :         // OGRParquetWriterLayer::ICreateFeature()
    1186           0 :         return OGRLayer::WriteArrowBatch(schema, array, papszOptions);
    1187             :     }
    1188             : 
    1189          18 :     return WriteArrowBatchInternal(  // The lambda below is handed to WriteArrowBatchInternal() as the callback that receives the record batch.
    1190             :         schema, array, papszOptions,
    1191          18 :         [this](const std::shared_ptr<arrow::RecordBatch> &poBatch)  // Starts a new buffered row group, then writes poBatch; false on either failure.
    1192             :         {
    1193          18 :             auto status = m_poFileWriter->NewBufferedRowGroup();
    1194           9 :             if (!status.ok())
    1195             :             {
    1196           0 :                 CPLError(CE_Failure, CPLE_AppDefined,
    1197             :                          "NewBufferedRowGroup() failed with %s",
    1198           0 :                          status.message().c_str());
    1199           0 :                 return false;
    1200             :             }
    1201             : 
    1202           9 :             status = m_poFileWriter->WriteRecordBatch(*poBatch);
    1203           9 :             if (!status.ok())
    1204             :             {
    1205           0 :                 CPLError(CE_Failure, CPLE_AppDefined,
    1206             :                          "WriteRecordBatch() failed: %s",
    1207           0 :                          status.message().c_str());
    1208           0 :                 return false;
    1209             :             }
    1210             : 
    1211           9 :             return true;
    1212           9 :         });
    1213             : }
    1214             : #endif
    1215             : 
    1216             : /************************************************************************/
    1217             : /*                         TestCapability()                             */
    1218             : /************************************************************************/
    1219             : 
    1220         352 : inline int OGRParquetWriterLayer::TestCapability(const char *pszCap)  // Denies OLCFastWriteArrowBatch in the cases below; every other query goes to OGRArrowWriterLayer.
    1221             : {
    1222             : #if PARQUET_VERSION_MAJOR <= 10
    1223             :     if (EQUAL(pszCap, OLCFastWriteArrowBatch))  // With PARQUET_VERSION_MAJOR <= 10 the capability is never advertised.
    1224             :         return false;
    1225             : #endif
    1226             : 
    1227         352 :     if (m_poTmpGPKGLayer && EQUAL(pszCap, OLCFastWriteArrowBatch))
    1228             :     {
    1229             :         // When using SORT_BY_BBOX=YES option, we can't directly write the
    1230             :         // input array, because we need to sort features. So this is not
    1231             :         // fast
    1232           1 :         return false;
    1233             :     }
    1234             : 
    1235         351 :     return OGRArrowWriterLayer::TestCapability(pszCap);
    1236             : }
    1237             : 
    1238             : /************************************************************************/
    1239             : /*                        CreateFieldFromArrowSchema()                  */
    1240             : /************************************************************************/
    1241             : 
    1242             : #if PARQUET_VERSION_MAJOR > 10
    1243         237 : bool OGRParquetWriterLayer::CreateFieldFromArrowSchema(  // Forwards to the OGRLayer implementation when a temporary GeoPackage layer is set, else to OGRArrowWriterLayer.
    1244             :     const struct ArrowSchema *schema, CSLConstList papszOptions)
    1245             : {
    1246         237 :     if (m_poTmpGPKGLayer)
    1247             :     {
    1248             :         // When using SORT_BY_BBOX=YES option, we can't directly write the
    1249             :         // input array, because we need to sort features. But this process
    1250             :         // only supports the base Arrow types supported by
    1251             :         // OGRLayer::WriteArrowBatch()
    1252           0 :         return OGRLayer::CreateFieldFromArrowSchema(schema, papszOptions);
    1253             :     }
    1254             : 
    1255         237 :     return OGRArrowWriterLayer::CreateFieldFromArrowSchema(schema,  // No temporary layer: use the Arrow writer layer implementation.
    1256         237 :                                                            papszOptions);
    1257             : }
    1258             : #endif
    1259             : 
    1260             : /************************************************************************/
    1261             : /*                        IsArrowSchemaSupported()                      */
    1262             : /************************************************************************/
    1263             : 
    1264             : #if PARQUET_VERSION_MAJOR > 10
    1265         716 : bool OGRParquetWriterLayer::IsArrowSchemaSupported(  // Returns whether schema and all of its children can be written; osErrorMsg is set when float16 is rejected.
    1266             :     const struct ArrowSchema *schema, CSLConstList papszOptions,
    1267             :     std::string &osErrorMsg) const
    1268             : {
    1269         716 :     if (m_poTmpGPKGLayer)
    1270             :     {
    1271             :         // When using SORT_BY_BBOX=YES option, we can't directly write the
    1272             :         // input array, because we need to sort features. But this process
    1273             :         // only supports the base Arrow types supported by
    1274             :         // OGRLayer::WriteArrowBatch()
    1275           0 :         return OGRLayer::IsArrowSchemaSupported(schema, papszOptions,
    1276           0 :                                                 osErrorMsg);
    1277             :     }
    1278             : 
    1279         716 :     if (schema->format[0] == 'e' && schema->format[1] == 0)  // Format string is exactly "e", which is float16 (see the message below).
    1280             :     {
    1281           1 :         osErrorMsg = "float16 not supported";
    1282           1 :         return false;
    1283             :     }
    1284        1423 :     for (int64_t i = 0; i < schema->n_children; ++i)  // Recurse into each child schema; the first unsupported child rejects the whole schema.
    1285             :     {
    1286         709 :         if (!IsArrowSchemaSupported(schema->children[i], papszOptions,
    1287             :                                     osErrorMsg))
    1288             :         {
    1289           1 :             return false;
    1290             :         }
    1291             :     }
    1292         714 :     return true;
    1293             : }
    1294             : #endif
    1295             : 
    1296             : /************************************************************************/
    1297             : /*                            SetMetadata()                             */
    1298             : /************************************************************************/
    1299             : 
    1300           6 : CPLErr OGRParquetWriterLayer::SetMetadata(char **papszMetadata,  // Forwards metadata to OGRLayer::SetMetadata(), except for the "SHAPEFILE" domain.
    1301             :                                           const char *pszDomain)
    1302             : {
    1303           6 :     if (!pszDomain || !EQUAL(pszDomain, "SHAPEFILE"))  // Null domain, or any domain that EQUAL() does not match to "SHAPEFILE".
    1304             :     {
    1305           4 :         return OGRLayer::SetMetadata(papszMetadata, pszDomain);
    1306             :     }
    1307           2 :     return CE_None;  // SHAPEFILE-domain metadata is not stored, yet success is reported.
    1308             : }
    1309             : 
    1310             : /************************************************************************/
    1311             : /*                             GetDataset()                             */
    1312             : /************************************************************************/
    1313             : 
    1314          17 : GDALDataset *OGRParquetWriterLayer::GetDataset()  // Accessor: returns the dataset pointer held in m_poDataset.
    1315             : {
    1316          17 :     return m_poDataset;
    1317             : }

Generated by: LCOV version 1.14