LCOV - code coverage report
Current view: top level - ogr/ogrsf_frmts/parquet - ogrparquetdataset.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 114 128 89.1 %
Date: 2024-05-14 23:54:21 Functions: 4 4 100.0 %

          Line data    Source code
       1             : /******************************************************************************
       2             :  *
       3             :  * Project:  Parquet Translator
       4             :  * Purpose:  Implements OGRParquetDriver.
       5             :  * Author:   Even Rouault, <even.rouault at spatialys.com>
       6             :  *
       7             :  ******************************************************************************
       8             :  * Copyright (c) 2022, Planet Labs
       9             :  *
      10             :  * Permission is hereby granted, free of charge, to any person obtaining a
      11             :  * copy of this software and associated documentation files (the "Software"),
      12             :  * to deal in the Software without restriction, including without limitation
      13             :  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      14             :  * and/or sell copies of the Software, and to permit persons to whom the
      15             :  * Software is furnished to do so, subject to the following conditions:
      16             :  *
      17             :  * The above copyright notice and this permission notice shall be included
      18             :  * in all copies or substantial portions of the Software.
      19             :  *
      20             :  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
      21             :  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
      22             :  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
      23             :  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
      24             :  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
      25             :  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
      26             :  * DEALINGS IN THE SOFTWARE.
      27             :  ****************************************************************************/
      28             : 
      29             : #include "ogr_parquet.h"
      30             : #include "ogr_mem.h"
      31             : #include "ogr_swq.h"
      32             : 
      33             : #include "../arrow_common/ograrrowdataset.hpp"
      34             : #include "../arrow_common/ograrrowlayer.hpp"
      35             : 
      36             : /************************************************************************/
      37             : /*                         OGRParquetDataset()                          */
      38             : /************************************************************************/
      39             : 
      40         681 : OGRParquetDataset::OGRParquetDataset(
      41         681 :     const std::shared_ptr<arrow::MemoryPool> &poMemoryPool)
      42         681 :     : OGRArrowDataset(poMemoryPool)
      43             : {
      44         681 : }
      45             : 
      46             : /***********************************************************************/
      47             : /*                            ExecuteSQL()                             */
      48             : /***********************************************************************/
      49             : 
      50          40 : OGRLayer *OGRParquetDataset::ExecuteSQL(const char *pszSQLCommand,
      51             :                                         OGRGeometry *poSpatialFilter,
      52             :                                         const char *pszDialect)
      53             : {
      54             :     /* -------------------------------------------------------------------- */
      55             :     /*      Special cases for SQL optimizations                             */
      56             :     /* -------------------------------------------------------------------- */
      57          40 :     if (STARTS_WITH_CI(pszSQLCommand, "SELECT ") &&
      58          10 :         (pszDialect == nullptr || EQUAL(pszDialect, "") ||
      59           0 :          EQUAL(pszDialect, "OGRSQL")))
      60             :     {
      61          35 :         swq_select oSelect;
      62          35 :         if (oSelect.preparse(pszSQLCommand) != CE_None)
      63           0 :             return nullptr;
      64             : 
      65             :         /* --------------------------------------------------------------------
      66             :          */
      67             :         /*      MIN/MAX/COUNT optimization */
      68             :         /* --------------------------------------------------------------------
      69             :          */
      70          35 :         if (oSelect.join_count == 0 && oSelect.poOtherSelect == nullptr &&
      71          35 :             oSelect.table_count == 1 && oSelect.order_specs == 0 &&
      72          35 :             oSelect.query_mode != SWQM_DISTINCT_LIST &&
      73          93 :             oSelect.where_expr == nullptr &&
      74          23 :             CPLTestBool(
      75             :                 CPLGetConfigOption("OGR_PARQUET_USE_STATISTICS", "YES")))
      76             :         {
      77           1 :             auto poLayer = dynamic_cast<OGRParquetLayer *>(
      78          23 :                 GetLayerByName(oSelect.table_defs[0].table_name));
      79          23 :             if (poLayer)
      80             :             {
      81          16 :                 OGRMemLayer *poMemLayer = nullptr;
      82          16 :                 const auto poLayerDefn = poLayer->GetLayerDefn();
      83             : 
      84          16 :                 int i = 0;  // Used after for.
      85          47 :                 for (; i < oSelect.result_columns(); i++)
      86             :                 {
      87          44 :                     swq_col_func col_func = oSelect.column_defs[i].col_func;
      88          44 :                     if (!(col_func == SWQCF_MIN || col_func == SWQCF_MAX ||
      89             :                           col_func == SWQCF_COUNT))
      90          13 :                         break;
      91             : 
      92             :                     const char *pszFieldName =
      93          37 :                         oSelect.column_defs[i].field_name;
      94          37 :                     if (pszFieldName == nullptr)
      95           0 :                         break;
      96          37 :                     if (oSelect.column_defs[i].target_type != SWQ_OTHER)
      97           0 :                         break;
      98             : 
      99             :                     const int iOGRField =
     100          37 :                         (EQUAL(pszFieldName, poLayer->GetFIDColumn()) &&
     101           2 :                          pszFieldName[0])
     102          39 :                             ? OGRParquetLayer::OGR_FID_INDEX
     103          35 :                             : poLayerDefn->GetFieldIndex(pszFieldName);
     104          37 :                     if (iOGRField < 0 &&
     105             :                         iOGRField != OGRParquetLayer::OGR_FID_INDEX)
     106           4 :                         break;
     107             : 
     108             :                     OGRField sField;
     109          33 :                     OGR_RawField_SetNull(&sField);
     110          33 :                     OGRFieldType eType = OFTReal;
     111          33 :                     OGRFieldSubType eSubType = OFSTNone;
     112             :                     const int iCol =
     113             :                         iOGRField == OGRParquetLayer::OGR_FID_INDEX
     114          64 :                             ? poLayer->GetFIDParquetColumn()
     115          31 :                             : poLayer->GetMapFieldIndexToParquetColumn()
     116          31 :                                   [iOGRField];
     117          33 :                     if (iCol < 0)
     118           0 :                         break;
     119             :                     const auto metadata =
     120          33 :                         poLayer->GetReader()->parquet_reader()->metadata();
     121          33 :                     const auto numRowGroups = metadata->num_row_groups();
     122          33 :                     bool bFound = false;
     123          33 :                     std::string sVal;
     124             : 
     125          33 :                     if (numRowGroups > 0)
     126             :                     {
     127             :                         const auto rowGroup0columnChunk =
     128          66 :                             metadata->RowGroup(0)->ColumnChunk(iCol);
     129             :                         const auto rowGroup0Stats =
     130          66 :                             rowGroup0columnChunk->statistics();
     131          65 :                         if (rowGroup0columnChunk->is_stats_set() &&
     132          32 :                             rowGroup0Stats)
     133             :                         {
     134             :                             OGRField sFieldDummy;
     135             :                             bool bFoundDummy;
     136          64 :                             std::string sValDummy;
     137             : 
     138          32 :                             if (col_func == SWQCF_MIN)
     139             :                             {
     140          15 :                                 CPL_IGNORE_RET_VAL(
     141          15 :                                     poLayer->GetMinMaxForOGRField(
     142             :                                         /* iRowGroup=*/-1,  // -1 for all
     143             :                                         iOGRField, true, sField, bFound, false,
     144             :                                         sFieldDummy, bFoundDummy, eType,
     145             :                                         eSubType, sVal, sValDummy));
     146             :                             }
     147          17 :                             else if (col_func == SWQCF_MAX)
     148             :                             {
     149          15 :                                 CPL_IGNORE_RET_VAL(
     150          15 :                                     poLayer->GetMinMaxForOGRField(
     151             :                                         /* iRowGroup=*/-1,  // -1 for all
     152             :                                         iOGRField, false, sFieldDummy,
     153             :                                         bFoundDummy, true, sField, bFound,
     154             :                                         eType, eSubType, sValDummy, sVal));
     155             :                             }
     156           2 :                             else if (col_func == SWQCF_COUNT)
     157             :                             {
     158           2 :                                 if (oSelect.column_defs[i].distinct_flag)
     159             :                                 {
     160           1 :                                     eType = OFTInteger64;
     161           1 :                                     sField.Integer64 = 0;
     162           1 :                                     for (int iGroup = 0; iGroup < numRowGroups;
     163             :                                          iGroup++)
     164             :                                     {
     165             :                                         const auto columnChunk =
     166           1 :                                             metadata->RowGroup(iGroup)
     167           1 :                                                 ->ColumnChunk(iCol);
     168             :                                         const auto colStats =
     169           1 :                                             columnChunk->statistics();
     170           2 :                                         if (columnChunk->is_stats_set() &&
     171           2 :                                             colStats &&
     172           1 :                                             colStats->HasDistinctCount())
     173             :                                         {
     174             :                                             // Statistics generated by arrow-cpp
     175             :                                             // Parquet writer seem to be buggy,
     176             :                                             // as distinct_count() is always
     177             :                                             // zero. We can detect this: if
     178             :                                             // there are non-null values, then
     179             :                                             // distinct_count() should be > 0.
     180           0 :                                             if (colStats->distinct_count() ==
     181           0 :                                                     0 &&
     182           0 :                                                 colStats->num_values() > 0)
     183             :                                             {
     184           0 :                                                 bFound = false;
     185           0 :                                                 break;
     186             :                                             }
     187           0 :                                             sField.Integer64 +=
     188           0 :                                                 colStats->distinct_count();
     189           0 :                                             bFound = true;
     190             :                                         }
     191             :                                         else
     192             :                                         {
     193           1 :                                             bFound = false;
     194           1 :                                             break;
     195             :                                         }
     196             :                                     }
     197             :                                 }
     198             :                                 else
     199             :                                 {
     200           1 :                                     eType = OFTInteger64;
     201           1 :                                     sField.Integer64 = 0;
     202           1 :                                     bFound = true;
     203           3 :                                     for (int iGroup = 0; iGroup < numRowGroups;
     204             :                                          iGroup++)
     205             :                                     {
     206             :                                         const auto columnChunk =
     207           2 :                                             metadata->RowGroup(iGroup)
     208           4 :                                                 ->ColumnChunk(iCol);
     209             :                                         const auto colStats =
     210           4 :                                             columnChunk->statistics();
     211           4 :                                         if (columnChunk->is_stats_set() &&
     212           2 :                                             colStats)
     213             :                                         {
     214           2 :                                             sField.Integer64 +=
     215           2 :                                                 colStats->num_values();
     216             :                                         }
     217             :                                         else
     218             :                                         {
     219           0 :                                             bFound = false;
     220             :                                         }
     221             :                                     }
     222             :                                 }
     223             :                             }
     224             :                         }
     225             :                         else
     226             :                         {
     227           1 :                             CPLDebug("PARQUET",
     228             :                                      "Statistics not available for field %s",
     229             :                                      pszFieldName);
     230             :                         }
     231             :                     }
     232          33 :                     if (!bFound)
     233             :                     {
     234           2 :                         break;
     235             :                     }
     236             : 
     237          31 :                     if (poMemLayer == nullptr)
     238             :                     {
     239           3 :                         poMemLayer =
     240           3 :                             new OGRMemLayer("SELECT", nullptr, wkbNone);
     241             :                         OGRFeature *poFeature =
     242           3 :                             new OGRFeature(poMemLayer->GetLayerDefn());
     243           3 :                         CPL_IGNORE_RET_VAL(
     244           3 :                             poMemLayer->CreateFeature(poFeature));
     245           3 :                         delete poFeature;
     246             :                     }
     247             : 
     248             :                     const char *pszMinMaxFieldName =
     249          47 :                         CPLSPrintf("%s_%s",
     250             :                                    (col_func == SWQCF_MIN)   ? "MIN"
     251          16 :                                    : (col_func == SWQCF_MAX) ? "MAX"
     252             :                                                              : "COUNT",
     253          31 :                                    oSelect.column_defs[i].field_name);
     254          62 :                     OGRFieldDefn oFieldDefn(pszMinMaxFieldName, eType);
     255          31 :                     oFieldDefn.SetSubType(eSubType);
     256          31 :                     poMemLayer->CreateField(&oFieldDefn);
     257             : 
     258          31 :                     OGRFeature *poFeature = poMemLayer->GetFeature(0);
     259          31 :                     poFeature->SetField(oFieldDefn.GetNameRef(), &sField);
     260          31 :                     CPL_IGNORE_RET_VAL(poMemLayer->SetFeature(poFeature));
     261          31 :                     delete poFeature;
     262             :                 }
     263          16 :                 if (i != oSelect.result_columns())
     264             :                 {
     265          13 :                     delete poMemLayer;
     266             :                 }
     267             :                 else
     268             :                 {
     269           3 :                     CPLDebug("PARQUET",
     270             :                              "Using optimized MIN/MAX/COUNT implementation");
     271           3 :                     return poMemLayer;
     272             :                 }
     273             :             }
     274             :         }
     275             :     }
     276             : 
     277          37 :     return GDALDataset::ExecuteSQL(pszSQLCommand, poSpatialFilter, pszDialect);
     278             : }
     279             : 
     280             : /***********************************************************************/
     281             : /*                           ReleaseResultSet()                        */
     282             : /***********************************************************************/
     283             : 
     284          32 : void OGRParquetDataset::ReleaseResultSet(OGRLayer *poResultsSet)
     285             : {
     286          32 :     delete poResultsSet;
     287          32 : }
     288             : 
     289             : /************************************************************************/
     290             : /*                           TestCapability()                           */
     291             : /************************************************************************/
     292             : 
     293          48 : int OGRParquetDataset::TestCapability(const char *pszCap)
     294             : 
     295             : {
     296          48 :     if (EQUAL(pszCap, ODsCZGeometries))
     297           5 :         return true;
     298          43 :     else if (EQUAL(pszCap, ODsCMeasuredGeometries))
     299          10 :         return true;
     300             : 
     301          33 :     return false;
     302             : }

Generated by: LCOV version 1.14