LCOV - code coverage report
Current view: top level - ogr/ogrsf_frmts/parquet - ogrparquetdataset.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 127 141 90.1 %
Date: 2025-09-10 17:48:50 Functions: 7 7 100.0 %

          Line data    Source code
       1             : /******************************************************************************
       2             :  *
       3             :  * Project:  Parquet Translator
       4             :  * Purpose:  Implements OGRParquetDriver.
       5             :  * Author:   Even Rouault, <even.rouault at spatialys.com>
       6             :  *
       7             :  ******************************************************************************
       8             :  * Copyright (c) 2022, Planet Labs
       9             :  *
      10             :  * SPDX-License-Identifier: MIT
      11             :  ****************************************************************************/
      12             : 
      13             : #include "ogr_parquet.h"
      14             : #include "memdataset.h"
      15             : #include "ogr_swq.h"
      16             : 
      17             : #include "../arrow_common/ograrrowdataset.hpp"
      18             : #include "../arrow_common/ograrrowlayer.hpp"
      19             : #include "../arrow_common/vsiarrowfilesystem.hpp"
      20             : 
      21             : /************************************************************************/
      22             : /*                         OGRParquetDataset()                          */
      23             : /************************************************************************/
      24             : 
      25        1147 : OGRParquetDataset::OGRParquetDataset(
      26        1147 :     const std::shared_ptr<arrow::MemoryPool> &poMemoryPool)
      27        1147 :     : OGRArrowDataset(poMemoryPool)
      28             : {
      29        1147 : }
      30             : 
      31             : /************************************************************************/
      32             : /*                        ~OGRParquetDataset()                          */
      33             : /************************************************************************/
      34             : 
      35        2294 : OGRParquetDataset::~OGRParquetDataset()
      36             : {
      37        1147 :     OGRParquetDataset::Close();
      38        2294 : }
      39             : 
      40             : /************************************************************************/
      41             : /*                                Close()                               */
      42             : /************************************************************************/
      43             : 
      44        2279 : CPLErr OGRParquetDataset::Close()
      45             : {
      46        2279 :     CPLErr eErr = CE_None;
      47        2279 :     if (nOpenFlags != OPEN_FLAGS_CLOSED)
      48             :     {
      49             :         // libarrow might continue to do I/O in auxiliary threads on the underlying
      50             :         // files when using the arrow::dataset API even after we closed the dataset.
      51             :         // This is annoying as it can cause crashes when closing GDAL, in particular
      52             :         // the virtual file manager, as this could result in VSI files being
      53             :         // accessed after their VSIVirtualFileSystem has been destroyed, resulting
      54             :         // in crashes. The workaround is to make sure that VSIArrowFileSystem
      55             :         // waits for all file handles it is aware of to have been destroyed.
      56        1147 :         eErr = OGRArrowDataset::Close();
      57             : 
      58        2294 :         auto poFS = std::dynamic_pointer_cast<VSIArrowFileSystem>(m_poFS);
      59        1147 :         if (poFS)
      60         266 :             poFS->AskToClose();
      61             :     }
      62             : 
      63        2279 :     return eErr;
      64             : }
      65             : 
      66             : /***********************************************************************/
      67             : /*                            ExecuteSQL()                             */
      68             : /***********************************************************************/
      69             : 
      70          46 : OGRLayer *OGRParquetDataset::ExecuteSQL(const char *pszSQLCommand,
      71             :                                         OGRGeometry *poSpatialFilter,
      72             :                                         const char *pszDialect)
      73             : {
      74             :     /* -------------------------------------------------------------------- */
      75             :     /*      Special cases for SQL optimizations                             */
      76             :     /* -------------------------------------------------------------------- */
      77          46 :     if (STARTS_WITH_CI(pszSQLCommand, "SELECT ") &&
      78          10 :         (pszDialect == nullptr || EQUAL(pszDialect, "") ||
      79           0 :          EQUAL(pszDialect, "OGRSQL")))
      80             :     {
      81          40 :         swq_select oSelect;
      82          40 :         if (oSelect.preparse(pszSQLCommand) != CE_None)
      83           0 :             return nullptr;
      84             : 
      85             :         /* --------------------------------------------------------------------
      86             :          */
      87             :         /*      MIN/MAX/COUNT optimization */
      88             :         /* --------------------------------------------------------------------
      89             :          */
      90          40 :         if (oSelect.join_count == 0 && oSelect.poOtherSelect == nullptr &&
      91          40 :             oSelect.table_count == 1 && oSelect.order_specs == 0 &&
      92          40 :             oSelect.query_mode != SWQM_DISTINCT_LIST &&
      93         106 :             oSelect.where_expr == nullptr &&
      94          26 :             CPLTestBool(
      95             :                 CPLGetConfigOption("OGR_PARQUET_USE_STATISTICS", "YES")))
      96             :         {
      97           1 :             auto poLayer = dynamic_cast<OGRParquetLayer *>(
      98          26 :                 GetLayerByName(oSelect.table_defs[0].table_name));
      99          26 :             if (poLayer)
     100             :             {
     101          16 :                 OGRMemLayer *poMemLayer = nullptr;
     102          16 :                 const auto poLayerDefn = poLayer->GetLayerDefn();
     103             : 
     104          16 :                 int i = 0;  // Used after for.
     105          47 :                 for (; i < oSelect.result_columns(); i++)
     106             :                 {
     107          44 :                     swq_col_func col_func = oSelect.column_defs[i].col_func;
     108          44 :                     if (!(col_func == SWQCF_MIN || col_func == SWQCF_MAX ||
     109             :                           col_func == SWQCF_COUNT))
     110          13 :                         break;
     111             : 
     112             :                     const char *pszFieldName =
     113          37 :                         oSelect.column_defs[i].field_name;
     114          37 :                     if (pszFieldName == nullptr)
     115           0 :                         break;
     116          37 :                     if (oSelect.column_defs[i].target_type != SWQ_OTHER)
     117           0 :                         break;
     118             : 
     119             :                     const int iOGRField =
     120          37 :                         (EQUAL(pszFieldName, poLayer->GetFIDColumn()) &&
     121           2 :                          pszFieldName[0])
     122          39 :                             ? OGRParquetLayer::OGR_FID_INDEX
     123          35 :                             : poLayerDefn->GetFieldIndex(pszFieldName);
     124          37 :                     if (iOGRField < 0 &&
     125             :                         iOGRField != OGRParquetLayer::OGR_FID_INDEX)
     126           4 :                         break;
     127             : 
     128             :                     OGRField sField;
     129          33 :                     OGR_RawField_SetNull(&sField);
     130          33 :                     OGRFieldType eType = OFTReal;
     131          33 :                     OGRFieldSubType eSubType = OFSTNone;
     132             :                     const int iCol =
     133             :                         iOGRField == OGRParquetLayer::OGR_FID_INDEX
     134          64 :                             ? poLayer->GetFIDParquetColumn()
     135          31 :                             : poLayer->GetMapFieldIndexToParquetColumn()
     136          31 :                                   [iOGRField];
     137          33 :                     if (iCol < 0)
     138           0 :                         break;
     139             :                     const auto metadata =
     140          33 :                         poLayer->GetReader()->parquet_reader()->metadata();
     141          33 :                     const auto numRowGroups = metadata->num_row_groups();
     142          33 :                     bool bFound = false;
     143          33 :                     std::string sVal;
     144             : 
     145          33 :                     if (numRowGroups > 0)
     146             :                     {
     147             :                         const auto rowGroup0columnChunk =
     148          66 :                             metadata->RowGroup(0)->ColumnChunk(iCol);
     149             :                         const auto rowGroup0Stats =
     150          66 :                             rowGroup0columnChunk->statistics();
     151          65 :                         if (rowGroup0columnChunk->is_stats_set() &&
     152          32 :                             rowGroup0Stats)
     153             :                         {
     154             :                             OGRField sFieldDummy;
     155             :                             bool bFoundDummy;
     156          64 :                             std::string sValDummy;
     157             : 
     158          32 :                             if (col_func == SWQCF_MIN)
     159             :                             {
     160          15 :                                 CPL_IGNORE_RET_VAL(
     161          15 :                                     poLayer->GetMinMaxForOGRField(
     162             :                                         /* iRowGroup=*/-1,  // -1 for all
     163             :                                         iOGRField, true, sField, bFound, false,
     164             :                                         sFieldDummy, bFoundDummy, eType,
     165             :                                         eSubType, sVal, sValDummy));
     166             :                             }
     167          17 :                             else if (col_func == SWQCF_MAX)
     168             :                             {
     169          15 :                                 CPL_IGNORE_RET_VAL(
     170          15 :                                     poLayer->GetMinMaxForOGRField(
     171             :                                         /* iRowGroup=*/-1,  // -1 for all
     172             :                                         iOGRField, false, sFieldDummy,
     173             :                                         bFoundDummy, true, sField, bFound,
     174             :                                         eType, eSubType, sValDummy, sVal));
     175             :                             }
     176           2 :                             else if (col_func == SWQCF_COUNT)
     177             :                             {
     178           2 :                                 if (oSelect.column_defs[i].distinct_flag)
     179             :                                 {
     180           1 :                                     eType = OFTInteger64;
     181           1 :                                     sField.Integer64 = 0;
     182           1 :                                     for (int iGroup = 0; iGroup < numRowGroups;
     183             :                                          iGroup++)
     184             :                                     {
     185             :                                         const auto columnChunk =
     186           1 :                                             metadata->RowGroup(iGroup)
     187           1 :                                                 ->ColumnChunk(iCol);
     188             :                                         const auto colStats =
     189           1 :                                             columnChunk->statistics();
     190           2 :                                         if (columnChunk->is_stats_set() &&
     191           2 :                                             colStats &&
     192           1 :                                             colStats->HasDistinctCount())
     193             :                                         {
     194             :                                             // Statistics generated by arrow-cpp
     195             :                                             // Parquet writer seem to be buggy,
     196             :                                             // as distinct_count() is always
     197             :                                             // zero. We can detect this: if
     198             :                                             // there are non-null values, then
     199             :                                             // distinct_count() should be > 0.
     200           0 :                                             if (colStats->distinct_count() ==
     201           0 :                                                     0 &&
     202           0 :                                                 colStats->num_values() > 0)
     203             :                                             {
     204           0 :                                                 bFound = false;
     205           0 :                                                 break;
     206             :                                             }
     207           0 :                                             sField.Integer64 +=
     208           0 :                                                 colStats->distinct_count();
     209           0 :                                             bFound = true;
     210             :                                         }
     211             :                                         else
     212             :                                         {
     213           1 :                                             bFound = false;
     214           1 :                                             break;
     215             :                                         }
     216             :                                     }
     217             :                                 }
     218             :                                 else
     219             :                                 {
     220           1 :                                     eType = OFTInteger64;
     221           1 :                                     sField.Integer64 = 0;
     222           1 :                                     bFound = true;
     223           3 :                                     for (int iGroup = 0; iGroup < numRowGroups;
     224             :                                          iGroup++)
     225             :                                     {
     226             :                                         const auto columnChunk =
     227           2 :                                             metadata->RowGroup(iGroup)
     228           4 :                                                 ->ColumnChunk(iCol);
     229             :                                         const auto colStats =
     230           4 :                                             columnChunk->statistics();
     231           4 :                                         if (columnChunk->is_stats_set() &&
     232           2 :                                             colStats)
     233             :                                         {
     234           2 :                                             sField.Integer64 +=
     235           2 :                                                 colStats->num_values();
     236             :                                         }
     237             :                                         else
     238             :                                         {
     239           0 :                                             bFound = false;
     240             :                                         }
     241             :                                     }
     242             :                                 }
     243             :                             }
     244             :                         }
     245             :                         else
     246             :                         {
     247           1 :                             CPLDebug("PARQUET",
     248             :                                      "Statistics not available for field %s",
     249             :                                      pszFieldName);
     250             :                         }
     251             :                     }
     252          33 :                     if (!bFound)
     253             :                     {
     254           2 :                         break;
     255             :                     }
     256             : 
     257          31 :                     if (poMemLayer == nullptr)
     258             :                     {
     259           3 :                         poMemLayer =
     260           3 :                             new OGRMemLayer("SELECT", nullptr, wkbNone);
     261             :                         OGRFeature *poFeature =
     262           3 :                             new OGRFeature(poMemLayer->GetLayerDefn());
     263           3 :                         CPL_IGNORE_RET_VAL(
     264           3 :                             poMemLayer->CreateFeature(poFeature));
     265           3 :                         delete poFeature;
     266             :                     }
     267             : 
     268             :                     const char *pszMinMaxFieldName =
     269          31 :                         oSelect.column_defs[i].field_alias
     270          31 :                             ? oSelect.column_defs[i].field_alias
     271          21 :                             : CPLSPrintf("%s_%s",
     272             :                                          (col_func == SWQCF_MIN)   ? "MIN"
     273           3 :                                          : (col_func == SWQCF_MAX) ? "MAX"
     274             :                                                                    : "COUNT",
     275          18 :                                          oSelect.column_defs[i].field_name);
     276          62 :                     OGRFieldDefn oFieldDefn(pszMinMaxFieldName, eType);
     277          31 :                     oFieldDefn.SetSubType(eSubType);
     278          31 :                     poMemLayer->CreateField(&oFieldDefn);
     279             : 
     280          31 :                     OGRFeature *poFeature = poMemLayer->GetFeature(0);
     281          31 :                     poFeature->SetField(oFieldDefn.GetNameRef(), &sField);
     282          31 :                     CPL_IGNORE_RET_VAL(poMemLayer->SetFeature(poFeature));
     283          31 :                     delete poFeature;
     284             :                 }
     285          16 :                 if (i != oSelect.result_columns())
     286             :                 {
     287          13 :                     delete poMemLayer;
     288             :                 }
     289             :                 else
     290             :                 {
     291           3 :                     CPLDebug("PARQUET",
     292             :                              "Using optimized MIN/MAX/COUNT implementation");
     293           3 :                     return poMemLayer;
     294             :                 }
     295             :             }
     296             :         }
     297             :     }
     298             : 
     299          43 :     return GDALDataset::ExecuteSQL(pszSQLCommand, poSpatialFilter, pszDialect);
     300             : }
     301             : 
     302             : /***********************************************************************/
     303             : /*                           ReleaseResultSet()                        */
     304             : /***********************************************************************/
     305             : 
     306          37 : void OGRParquetDataset::ReleaseResultSet(OGRLayer *poResultsSet)
     307             : {
     308          37 :     delete poResultsSet;
     309          37 : }
     310             : 
     311             : /************************************************************************/
     312             : /*                           TestCapability()                           */
     313             : /************************************************************************/
     314             : 
     315          64 : int OGRParquetDataset::TestCapability(const char *pszCap) const
     316             : 
     317             : {
     318          64 :     if (EQUAL(pszCap, ODsCZGeometries))
     319           6 :         return true;
     320          58 :     else if (EQUAL(pszCap, ODsCMeasuredGeometries))
     321          12 :         return true;
     322             : 
     323          46 :     return false;
     324             : }

Generated by: LCOV version 1.14