Line data Source code
1 : /******************************************************************************
2 : *
3 : * Project: Parquet Translator
4 : * Purpose: Implements OGRParquetDriver.
5 : * Author: Even Rouault, <even.rouault at spatialys.com>
6 : *
7 : ******************************************************************************
8 : * Copyright (c) 2022, Planet Labs
9 : *
10 : * Permission is hereby granted, free of charge, to any person obtaining a
11 : * copy of this software and associated documentation files (the "Software"),
12 : * to deal in the Software without restriction, including without limitation
13 : * the rights to use, copy, modify, merge, publish, distribute, sublicense,
14 : * and/or sell copies of the Software, and to permit persons to whom the
15 : * Software is furnished to do so, subject to the following conditions:
16 : *
17 : * The above copyright notice and this permission notice shall be included
18 : * in all copies or substantial portions of the Software.
19 : *
20 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
21 : * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 : * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 : * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 : * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25 : * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 : * DEALINGS IN THE SOFTWARE.
27 : ****************************************************************************/
28 :
29 : #undef DO_NOT_DEFINE_GDAL_DATE_NAME
30 : #include "gdal_version_full/gdal_version.h"
31 :
32 : #include "ogr_parquet.h"
33 :
34 : #include "../arrow_common/ograrrowwriterlayer.hpp"
35 :
36 : #include "ogr_wkb.h"
37 :
38 : #include <utility>
39 :
40 : /************************************************************************/
41 : /* OGRParquetWriterLayer() */
42 : /************************************************************************/
43 :
44 200 : OGRParquetWriterLayer::OGRParquetWriterLayer(
45 : OGRParquetWriterDataset *poDataset, arrow::MemoryPool *poMemoryPool,
46 : const std::shared_ptr<arrow::io::OutputStream> &poOutputStream,
47 200 : const char *pszLayerName)
48 : : OGRArrowWriterLayer(poMemoryPool, poOutputStream, pszLayerName),
49 200 : m_poDataset(poDataset)
50 : {
51 200 : m_bWriteFieldArrowExtensionName = CPLTestBool(
52 : CPLGetConfigOption("OGR_PARQUET_WRITE_ARROW_EXTENSION_NAME", "NO"));
53 200 : }
54 :
55 : /************************************************************************/
56 : /* Close() */
57 : /************************************************************************/
58 :
59 197 : bool OGRParquetWriterLayer::Close()
60 : {
61 197 : if (m_poTmpGPKGLayer)
62 : {
63 2 : if (!CopyTmpGpkgLayerToFinalFile())
64 0 : return false;
65 : }
66 :
67 197 : if (m_bInitializationOK)
68 : {
69 197 : if (!FinalizeWriting())
70 0 : return false;
71 : }
72 :
73 197 : return true;
74 : }
75 :
76 : /************************************************************************/
77 : /* CopyTmpGpkgLayerToFinalFile() */
78 : /************************************************************************/
79 :
80 2 : bool OGRParquetWriterLayer::CopyTmpGpkgLayerToFinalFile()
81 : {
82 2 : if (!m_poTmpGPKGLayer)
83 : {
84 0 : return true;
85 : }
86 :
87 2 : CPLDebug("PARQUET", "CopyTmpGpkgLayerToFinalFile(): start...");
88 :
89 2 : VSIUnlink(m_poTmpGPKG->GetDescription());
90 :
91 4 : OGRFeature oFeat(m_poFeatureDefn);
92 :
93 : // Interval in terms of features between 2 debug progress report messages
94 2 : constexpr int PROGRESS_FC_INTERVAL = 100 * 1000;
95 :
96 : // First, write features without geometries
97 : {
98 2 : auto poTmpLayer = std::unique_ptr<OGRLayer>(m_poTmpGPKG->ExecuteSQL(
99 : "SELECT serialized_feature FROM tmp WHERE fid NOT IN (SELECT id "
100 : "FROM rtree_tmp_geom)",
101 2 : nullptr, nullptr));
102 2 : if (!poTmpLayer)
103 0 : return false;
104 1004 : for (const auto &poSrcFeature : poTmpLayer.get())
105 : {
106 1002 : int nBytesFeature = 0;
107 : const GByte *pabyFeatureData =
108 1002 : poSrcFeature->GetFieldAsBinary(0, &nBytesFeature);
109 1002 : if (!oFeat.DeserializeFromBinary(pabyFeatureData, nBytesFeature))
110 : {
111 0 : CPLError(CE_Failure, CPLE_AppDefined,
112 : "Cannot deserialize feature");
113 0 : return false;
114 : }
115 1002 : if (OGRArrowWriterLayer::ICreateFeature(&oFeat) != OGRERR_NONE)
116 : {
117 0 : return false;
118 : }
119 :
120 1002 : if ((m_nFeatureCount % PROGRESS_FC_INTERVAL) == 0)
121 : {
122 0 : CPLDebugProgress(
123 : "PARQUET",
124 : "CopyTmpGpkgLayerToFinalFile(): %.02f%% progress",
125 0 : 100.0 * double(m_nFeatureCount) /
126 0 : double(m_nTmpFeatureCount));
127 : }
128 : }
129 :
130 2 : if (!FlushFeatures())
131 : {
132 0 : return false;
133 : }
134 : }
135 :
136 : // Now walk through the GPKG RTree for features with geometries
137 : // Cf https://github.com/sqlite/sqlite/blob/master/ext/rtree/rtree.c
138 : // for the description of the content of the rtree _node table
139 4 : std::vector<std::pair<int64_t, int>> aNodeNoDepthPair;
140 2 : int nTreeDepth = 0;
141 : // Queue the root node
142 : aNodeNoDepthPair.emplace_back(
143 2 : std::make_pair(/* nodeNo = */ 1, /* depth = */ 0));
144 2 : int nCountWrittenFeaturesSinceLastFlush = 0;
145 50 : while (!aNodeNoDepthPair.empty())
146 : {
147 48 : const auto &oLastPair = aNodeNoDepthPair.back();
148 48 : const int64_t nNodeNo = oLastPair.first;
149 48 : const int nCurDepth = oLastPair.second;
150 : //CPLDebug("PARQUET", "Reading nodeNode=%d, curDepth=%d", int(nNodeNo), nCurDepth);
151 48 : aNodeNoDepthPair.pop_back();
152 :
153 48 : auto poRTreeLayer = std::unique_ptr<OGRLayer>(m_poTmpGPKG->ExecuteSQL(
154 : CPLSPrintf("SELECT data FROM rtree_tmp_geom_node WHERE nodeno "
155 : "= " CPL_FRMT_GIB,
156 : static_cast<GIntBig>(nNodeNo)),
157 48 : nullptr, nullptr));
158 48 : if (!poRTreeLayer)
159 : {
160 0 : CPLError(CE_Failure, CPLE_AppDefined,
161 : "Cannot read node " CPL_FRMT_GIB,
162 : static_cast<GIntBig>(nNodeNo));
163 0 : return false;
164 : }
165 : const auto poRTreeFeature =
166 48 : std::unique_ptr<const OGRFeature>(poRTreeLayer->GetNextFeature());
167 48 : if (!poRTreeFeature)
168 : {
169 0 : CPLError(CE_Failure, CPLE_AppDefined,
170 : "Cannot read node " CPL_FRMT_GIB,
171 : static_cast<GIntBig>(nNodeNo));
172 0 : return false;
173 : }
174 :
175 48 : int nNodeBytes = 0;
176 : const GByte *pabyNodeData =
177 48 : poRTreeFeature->GetFieldAsBinary(0, &nNodeBytes);
178 48 : constexpr int BLOB_HEADER_SIZE = 4;
179 48 : if (nNodeBytes < BLOB_HEADER_SIZE)
180 : {
181 0 : CPLError(CE_Failure, CPLE_AppDefined,
182 : "Not enough bytes when reading node " CPL_FRMT_GIB,
183 : static_cast<GIntBig>(nNodeNo));
184 0 : return false;
185 : }
186 48 : if (nNodeNo == 1)
187 : {
188 : // Get the RTree depth from the root node
189 2 : nTreeDepth = (pabyNodeData[0] << 8) | pabyNodeData[1];
190 : //CPLDebug("PARQUET", "nTreeDepth = %d", nTreeDepth);
191 : }
192 :
193 48 : const int nCellCount = (pabyNodeData[2] << 8) | pabyNodeData[3];
194 48 : constexpr int SIZEOF_CELL = 24; // int64_t + 4 float
195 48 : if (nNodeBytes < BLOB_HEADER_SIZE + SIZEOF_CELL * nCellCount)
196 : {
197 0 : CPLError(CE_Failure, CPLE_AppDefined,
198 : "Not enough bytes when reading node " CPL_FRMT_GIB,
199 : static_cast<GIntBig>(nNodeNo));
200 0 : return false;
201 : }
202 :
203 48 : size_t nOffset = BLOB_HEADER_SIZE;
204 48 : if (nCurDepth == nTreeDepth)
205 : {
206 : // Leaf node: it references feature IDs.
207 :
208 : // If we are about to go above m_nRowGroupSize, flush past
209 : // features now, to improve the spatial compacity of the row group.
210 46 : if (m_nRowGroupSize > nCellCount &&
211 46 : nCountWrittenFeaturesSinceLastFlush + nCellCount >
212 46 : m_nRowGroupSize)
213 : {
214 14 : nCountWrittenFeaturesSinceLastFlush = 0;
215 14 : if (!FlushFeatures())
216 : {
217 0 : return false;
218 : }
219 : }
220 :
221 : // nCellCount shouldn't be over 51 normally, but even 65535
222 : // would be fine...
223 : // coverity[tainted_data]
224 1248 : for (int i = 0; i < nCellCount; ++i)
225 : {
226 : int64_t nFID;
227 1202 : memcpy(&nFID, pabyNodeData + nOffset, sizeof(int64_t));
228 1202 : CPL_MSBPTR64(&nFID);
229 :
230 : const auto poSrcFeature = std::unique_ptr<const OGRFeature>(
231 1202 : m_poTmpGPKGLayer->GetFeature(nFID));
232 1202 : if (!poSrcFeature)
233 : {
234 0 : CPLError(CE_Failure, CPLE_AppDefined,
235 : "Cannot get feature " CPL_FRMT_GIB,
236 : static_cast<GIntBig>(nFID));
237 0 : return false;
238 : }
239 :
240 1202 : int nBytesFeature = 0;
241 : const GByte *pabyFeatureData =
242 1202 : poSrcFeature->GetFieldAsBinary(0, &nBytesFeature);
243 1202 : if (!oFeat.DeserializeFromBinary(pabyFeatureData,
244 : nBytesFeature))
245 : {
246 0 : CPLError(CE_Failure, CPLE_AppDefined,
247 : "Cannot deserialize feature");
248 0 : return false;
249 : }
250 1202 : if (OGRArrowWriterLayer::ICreateFeature(&oFeat) != OGRERR_NONE)
251 : {
252 0 : return false;
253 : }
254 :
255 1202 : nOffset += SIZEOF_CELL;
256 :
257 1202 : ++nCountWrittenFeaturesSinceLastFlush;
258 :
259 1202 : if ((m_nFeatureCount % PROGRESS_FC_INTERVAL) == 0 ||
260 1202 : m_nFeatureCount == m_nTmpFeatureCount / 2)
261 : {
262 2 : CPLDebugProgress(
263 : "PARQUET",
264 : "CopyTmpGpkgLayerToFinalFile(): %.02f%% progress",
265 2 : 100.0 * double(m_nFeatureCount) /
266 2 : double(m_nTmpFeatureCount));
267 : }
268 : }
269 : }
270 : else
271 : {
272 : // Non-leaf node: it references child nodes.
273 :
274 : // nCellCount shouldn't be over 51 normally, but even 65535
275 : // would be fine...
276 : // coverity[tainted_data]
277 48 : for (int i = 0; i < nCellCount; ++i)
278 : {
279 : int64_t nNode;
280 46 : memcpy(&nNode, pabyNodeData + nOffset, sizeof(int64_t));
281 46 : CPL_MSBPTR64(&nNode);
282 : aNodeNoDepthPair.emplace_back(
283 46 : std::make_pair(nNode, nCurDepth + 1));
284 46 : nOffset += SIZEOF_CELL;
285 : }
286 : }
287 : }
288 :
289 2 : CPLDebug("PARQUET",
290 : "CopyTmpGpkgLayerToFinalFile(): 100%%, successfully finished");
291 2 : return true;
292 : }
293 :
294 : /************************************************************************/
295 : /* IsSupportedGeometryType() */
296 : /************************************************************************/
297 :
298 204 : bool OGRParquetWriterLayer::IsSupportedGeometryType(
299 : OGRwkbGeometryType eGType) const
300 : {
301 204 : const auto eFlattenType = wkbFlatten(eGType);
302 204 : if (!OGR_GT_HasM(eGType) && eFlattenType <= wkbGeometryCollection)
303 : {
304 203 : return true;
305 : }
306 :
307 : const auto osConfigOptionName =
308 3 : "OGR_" + GetDriverUCName() + "_ALLOW_ALL_DIMS";
309 1 : if (CPLTestBool(CPLGetConfigOption(osConfigOptionName.c_str(), "NO")))
310 : {
311 0 : return true;
312 : }
313 :
314 1 : CPLError(CE_Failure, CPLE_NotSupported,
315 : "Only 2D and Z geometry types are supported (unless the "
316 : "%s configuration option is set to YES)",
317 : osConfigOptionName.c_str());
318 1 : return false;
319 : }
320 :
321 : /************************************************************************/
322 : /* SetOptions() */
323 : /************************************************************************/
324 :
325 200 : bool OGRParquetWriterLayer::SetOptions(CSLConstList papszOptions,
326 : const OGRSpatialReference *poSpatialRef,
327 : OGRwkbGeometryType eGType)
328 : {
329 200 : m_bWriteBBoxStruct = CPLTestBool(CSLFetchNameValueDef(
330 : papszOptions, "WRITE_COVERING_BBOX",
331 : CPLGetConfigOption("OGR_PARQUET_WRITE_COVERING_BBOX", "YES")));
332 :
333 200 : if (CPLTestBool(CSLFetchNameValueDef(papszOptions, "SORT_BY_BBOX", "NO")))
334 : {
335 6 : const std::string osTmpGPKG(std::string(m_poDataset->GetDescription()) +
336 3 : ".tmp.gpkg");
337 3 : auto poGPKGDrv = GetGDALDriverManager()->GetDriverByName("GPKG");
338 3 : if (!poGPKGDrv)
339 : {
340 1 : CPLError(
341 : CE_Failure, CPLE_AppDefined,
342 : "Driver GPKG required for SORT_BY_BBOX layer creation option");
343 1 : return false;
344 : }
345 2 : m_poTmpGPKG.reset(poGPKGDrv->Create(osTmpGPKG.c_str(), 0, 0, 0,
346 : GDT_Unknown, nullptr));
347 2 : if (!m_poTmpGPKG)
348 0 : return false;
349 2 : m_poTmpGPKG->MarkSuppressOnClose();
350 2 : m_poTmpGPKGLayer = m_poTmpGPKG->CreateLayer("tmp");
351 2 : if (!m_poTmpGPKGLayer)
352 0 : return false;
353 : // Serialized feature
354 2 : m_poTmpGPKGLayer->CreateField(
355 2 : std::make_unique<OGRFieldDefn>("serialized_feature", OFTBinary)
356 2 : .get());
357 2 : CPL_IGNORE_RET_VAL(m_poTmpGPKGLayer->StartTransaction());
358 : }
359 :
360 : const char *pszGeomEncoding =
361 199 : CSLFetchNameValue(papszOptions, "GEOMETRY_ENCODING");
362 199 : m_eGeomEncoding = OGRArrowGeomEncoding::WKB;
363 199 : if (pszGeomEncoding)
364 : {
365 92 : if (EQUAL(pszGeomEncoding, "WKB"))
366 0 : m_eGeomEncoding = OGRArrowGeomEncoding::WKB;
367 92 : else if (EQUAL(pszGeomEncoding, "WKT"))
368 8 : m_eGeomEncoding = OGRArrowGeomEncoding::WKT;
369 84 : else if (EQUAL(pszGeomEncoding, "GEOARROW_INTERLEAVED"))
370 : {
371 : static bool bHasWarned = false;
372 28 : if (!bHasWarned)
373 : {
374 1 : bHasWarned = true;
375 1 : CPLError(
376 : CE_Warning, CPLE_AppDefined,
377 : "Use of GEOMETRY_ENCODING=GEOARROW_INTERLEAVED is not "
378 : "recommended. "
379 : "GeoParquet 1.1 uses GEOMETRY_ENCODING=GEOARROW (struct) "
380 : "instead.");
381 : }
382 28 : m_eGeomEncoding = OGRArrowGeomEncoding::GEOARROW_FSL_GENERIC;
383 : }
384 56 : else if (EQUAL(pszGeomEncoding, "GEOARROW") ||
385 0 : EQUAL(pszGeomEncoding, "GEOARROW_STRUCT"))
386 56 : m_eGeomEncoding = OGRArrowGeomEncoding::GEOARROW_STRUCT_GENERIC;
387 : else
388 : {
389 0 : CPLError(CE_Failure, CPLE_NotSupported,
390 : "Unsupported GEOMETRY_ENCODING = %s", pszGeomEncoding);
391 0 : return false;
392 : }
393 : }
394 :
395 : const char *pszCoordPrecision =
396 199 : CSLFetchNameValue(papszOptions, "COORDINATE_PRECISION");
397 199 : if (pszCoordPrecision)
398 0 : m_nWKTCoordinatePrecision = atoi(pszCoordPrecision);
399 :
400 199 : m_bForceCounterClockwiseOrientation =
401 199 : EQUAL(CSLFetchNameValueDef(papszOptions, "POLYGON_ORIENTATION",
402 : "COUNTERCLOCKWISE"),
403 : "COUNTERCLOCKWISE");
404 :
405 199 : if (eGType != wkbNone)
406 : {
407 178 : if (!IsSupportedGeometryType(eGType))
408 : {
409 1 : return false;
410 : }
411 :
412 177 : m_poFeatureDefn->SetGeomType(eGType);
413 177 : auto eGeomEncoding = m_eGeomEncoding;
414 177 : if (eGeomEncoding == OGRArrowGeomEncoding::GEOARROW_FSL_GENERIC ||
415 149 : eGeomEncoding == OGRArrowGeomEncoding::GEOARROW_STRUCT_GENERIC)
416 : {
417 84 : const auto eEncodingType = eGeomEncoding;
418 84 : eGeomEncoding = GetPreciseArrowGeomEncoding(eEncodingType, eGType);
419 84 : if (eGeomEncoding == eEncodingType)
420 0 : return false;
421 : }
422 177 : m_aeGeomEncoding.push_back(eGeomEncoding);
423 177 : m_poFeatureDefn->GetGeomFieldDefn(0)->SetName(
424 : CSLFetchNameValueDef(papszOptions, "GEOMETRY_NAME", "geometry"));
425 177 : if (poSpatialRef)
426 : {
427 20 : auto poSRS = poSpatialRef->Clone();
428 20 : m_poFeatureDefn->GetGeomFieldDefn(0)->SetSpatialRef(poSRS);
429 20 : poSRS->Release();
430 : }
431 : }
432 :
433 198 : m_osFIDColumn = CSLFetchNameValueDef(papszOptions, "FID", "");
434 :
435 198 : const char *pszCompression = CSLFetchNameValue(papszOptions, "COMPRESSION");
436 198 : if (pszCompression == nullptr)
437 : {
438 582 : auto oResult = arrow::util::Codec::GetCompressionType("snappy");
439 194 : if (oResult.ok() && arrow::util::Codec::IsAvailable(*oResult))
440 : {
441 194 : pszCompression = "SNAPPY";
442 : }
443 : else
444 : {
445 0 : pszCompression = "NONE";
446 : }
447 : }
448 :
449 198 : if (EQUAL(pszCompression, "NONE"))
450 0 : pszCompression = "UNCOMPRESSED";
451 : auto oResult = arrow::util::Codec::GetCompressionType(
452 396 : CPLString(pszCompression).tolower());
453 198 : if (!oResult.ok())
454 : {
455 1 : CPLError(CE_Failure, CPLE_NotSupported,
456 : "Unrecognized compression method: %s", pszCompression);
457 1 : return false;
458 : }
459 197 : m_eCompression = *oResult;
460 197 : if (!arrow::util::Codec::IsAvailable(m_eCompression))
461 : {
462 0 : CPLError(CE_Failure, CPLE_NotSupported,
463 : "Compression method %s is known, but libarrow has not "
464 : "been built with support for it",
465 : pszCompression);
466 0 : return false;
467 : }
468 :
469 197 : m_oWriterPropertiesBuilder.compression(m_eCompression);
470 : const std::string osCreator =
471 197 : CSLFetchNameValueDef(papszOptions, "CREATOR", "");
472 197 : if (!osCreator.empty())
473 1 : m_oWriterPropertiesBuilder.created_by(osCreator);
474 : else
475 196 : m_oWriterPropertiesBuilder.created_by("GDAL " GDAL_RELEASE_NAME
476 : ", using " CREATED_BY_VERSION);
477 :
478 : // Undocumented option. Not clear it is useful besides unit test purposes
479 197 : if (!CPLTestBool(CSLFetchNameValueDef(papszOptions, "STATISTICS", "YES")))
480 1 : m_oWriterPropertiesBuilder.disable_statistics();
481 :
482 197 : if (m_eGeomEncoding == OGRArrowGeomEncoding::WKB && eGType != wkbNone)
483 : {
484 92 : m_oWriterPropertiesBuilder.disable_statistics(
485 276 : parquet::schema::ColumnPath::FromDotString(
486 92 : m_poFeatureDefn->GetGeomFieldDefn(0)->GetNameRef()));
487 : }
488 :
489 : const char *pszRowGroupSize =
490 197 : CSLFetchNameValue(papszOptions, "ROW_GROUP_SIZE");
491 197 : if (pszRowGroupSize)
492 : {
493 5 : auto nRowGroupSize = static_cast<int64_t>(atoll(pszRowGroupSize));
494 5 : if (nRowGroupSize > 0)
495 : {
496 5 : if (nRowGroupSize > INT_MAX)
497 0 : nRowGroupSize = INT_MAX;
498 5 : m_nRowGroupSize = nRowGroupSize;
499 : }
500 : }
501 :
502 197 : m_bEdgesSpherical = EQUAL(
503 : CSLFetchNameValueDef(papszOptions, "EDGES", "PLANAR"), "SPHERICAL");
504 :
505 197 : m_bInitializationOK = true;
506 197 : return true;
507 : }
508 :
509 : /************************************************************************/
510 : /* CloseFileWriter() */
511 : /************************************************************************/
512 :
513 197 : bool OGRParquetWriterLayer::CloseFileWriter()
514 : {
515 394 : auto status = m_poFileWriter->Close();
516 197 : if (!status.ok())
517 : {
518 0 : CPLError(CE_Failure, CPLE_AppDefined,
519 : "FileWriter::Close() failed with %s",
520 0 : status.message().c_str());
521 : }
522 394 : return status.ok();
523 : }
524 :
525 : /************************************************************************/
526 : /* IdentifyCRS() */
527 : /************************************************************************/
528 :
529 19 : static OGRSpatialReference IdentifyCRS(const OGRSpatialReference *poSRS)
530 : {
531 19 : OGRSpatialReference oSRSIdentified(*poSRS);
532 :
533 19 : if (poSRS->GetAuthorityName(nullptr) == nullptr)
534 : {
535 : // Try to find a registered CRS that matches the input one
536 4 : int nEntries = 0;
537 4 : int *panConfidence = nullptr;
538 : OGRSpatialReferenceH *pahSRS =
539 4 : poSRS->FindMatches(nullptr, &nEntries, &panConfidence);
540 :
541 : // If there are several matches >= 90%, take the only one
542 : // that is EPSG
543 4 : int iOtherAuthority = -1;
544 4 : int iEPSG = -1;
545 4 : const char *const apszOptions[] = {
546 : "IGNORE_DATA_AXIS_TO_SRS_AXIS_MAPPING=YES", nullptr};
547 4 : int iConfidenceBestMatch = -1;
548 6 : for (int iSRS = 0; iSRS < nEntries; iSRS++)
549 : {
550 4 : auto poCandidateCRS = OGRSpatialReference::FromHandle(pahSRS[iSRS]);
551 4 : if (panConfidence[iSRS] < iConfidenceBestMatch ||
552 4 : panConfidence[iSRS] < 70)
553 : {
554 : break;
555 : }
556 3 : if (poSRS->IsSame(poCandidateCRS, apszOptions))
557 : {
558 : const char *pszAuthName =
559 3 : poCandidateCRS->GetAuthorityName(nullptr);
560 3 : if (pszAuthName != nullptr && EQUAL(pszAuthName, "EPSG"))
561 : {
562 2 : iOtherAuthority = -2;
563 2 : if (iEPSG < 0)
564 : {
565 2 : iConfidenceBestMatch = panConfidence[iSRS];
566 2 : iEPSG = iSRS;
567 : }
568 : else
569 : {
570 0 : iEPSG = -1;
571 0 : break;
572 : }
573 : }
574 1 : else if (iEPSG < 0 && pszAuthName != nullptr)
575 : {
576 1 : if (EQUAL(pszAuthName, "OGC"))
577 : {
578 : const char *pszAuthCode =
579 1 : poCandidateCRS->GetAuthorityCode(nullptr);
580 1 : if (pszAuthCode && EQUAL(pszAuthCode, "CRS84"))
581 : {
582 1 : iOtherAuthority = iSRS;
583 1 : break;
584 : }
585 : }
586 0 : else if (iOtherAuthority == -1)
587 : {
588 0 : iConfidenceBestMatch = panConfidence[iSRS];
589 0 : iOtherAuthority = iSRS;
590 : }
591 : else
592 0 : iOtherAuthority = -2;
593 : }
594 : }
595 : }
596 4 : if (iEPSG >= 0)
597 : {
598 2 : oSRSIdentified = *OGRSpatialReference::FromHandle(pahSRS[iEPSG]);
599 : }
600 2 : else if (iOtherAuthority >= 0)
601 : {
602 : oSRSIdentified =
603 1 : *OGRSpatialReference::FromHandle(pahSRS[iOtherAuthority]);
604 : }
605 4 : OSRFreeSRSArray(pahSRS);
606 4 : CPLFree(panConfidence);
607 : }
608 :
609 19 : return oSRSIdentified;
610 : }
611 :
612 : /************************************************************************/
613 : /* RemoveIDFromMemberOfEnsembles() */
614 : /************************************************************************/
615 :
616 236 : static void RemoveIDFromMemberOfEnsembles(CPLJSONObject &obj)
617 : {
618 : // Remove "id" from members of datum ensembles for compatibility with
619 : // older PROJ versions
620 : // Cf https://github.com/opengeospatial/geoparquet/discussions/110
621 : // and https://github.com/OSGeo/PROJ/pull/3221
622 236 : if (obj.GetType() == CPLJSONObject::Type::Object)
623 : {
624 298 : for (auto &subObj : obj.GetChildren())
625 : {
626 228 : RemoveIDFromMemberOfEnsembles(subObj);
627 : }
628 : }
629 182 : else if (obj.GetType() == CPLJSONObject::Type::Array &&
630 182 : obj.GetName() == "members")
631 : {
632 0 : for (auto &subObj : obj.ToArray())
633 : {
634 0 : subObj.Delete("id");
635 : }
636 : }
637 236 : }
638 :
639 : /************************************************************************/
640 : /* GetGeoMetadata() */
641 : /************************************************************************/
642 :
643 197 : std::string OGRParquetWriterLayer::GetGeoMetadata() const
644 : {
645 : // Just for unit testing purposes
646 : const char *pszGeoMetadata =
647 197 : CPLGetConfigOption("OGR_PARQUET_GEO_METADATA", nullptr);
648 197 : if (pszGeoMetadata)
649 16 : return pszGeoMetadata;
650 :
651 351 : if (m_poFeatureDefn->GetGeomFieldCount() != 0 &&
652 170 : CPLTestBool(CPLGetConfigOption("OGR_PARQUET_WRITE_GEO", "YES")))
653 : {
654 338 : CPLJSONObject oRoot;
655 169 : oRoot.Add("version",
656 169 : m_eGeomEncoding ==
657 : OGRArrowGeomEncoding::GEOARROW_STRUCT_GENERIC
658 : ? "1.1.0"
659 : : "1.0.0");
660 169 : oRoot.Add("primary_column",
661 169 : m_poFeatureDefn->GetGeomFieldDefn(0)->GetNameRef());
662 338 : CPLJSONObject oColumns;
663 169 : oRoot.Add("columns", oColumns);
664 355 : for (int i = 0; i < m_poFeatureDefn->GetGeomFieldCount(); ++i)
665 : {
666 186 : const auto poGeomFieldDefn = m_poFeatureDefn->GetGeomFieldDefn(i);
667 372 : CPLJSONObject oColumn;
668 186 : oColumns.Add(poGeomFieldDefn->GetNameRef(), oColumn);
669 186 : oColumn.Add("encoding",
670 186 : GetGeomEncodingAsString(m_aeGeomEncoding[i], true));
671 :
672 186 : if (CPLTestBool(CPLGetConfigOption("OGR_PARQUET_WRITE_CRS", "YES")))
673 : {
674 185 : const auto poSRS = poGeomFieldDefn->GetSpatialRef();
675 185 : if (poSRS)
676 : {
677 38 : OGRSpatialReference oSRSIdentified(IdentifyCRS(poSRS));
678 :
679 : const char *pszAuthName =
680 19 : oSRSIdentified.GetAuthorityName(nullptr);
681 : const char *pszAuthCode =
682 19 : oSRSIdentified.GetAuthorityCode(nullptr);
683 :
684 19 : bool bOmitCRS = false;
685 19 : if (pszAuthName != nullptr && pszAuthCode != nullptr &&
686 18 : ((EQUAL(pszAuthName, "EPSG") &&
687 15 : EQUAL(pszAuthCode, "4326")) ||
688 10 : (EQUAL(pszAuthName, "OGC") &&
689 3 : EQUAL(pszAuthCode, "CRS84"))))
690 : {
691 : // To make things less confusing for non-geo-aware
692 : // consumers, omit EPSG:4326 / OGC:CRS84 CRS by default
693 11 : bOmitCRS = CPLTestBool(CPLGetConfigOption(
694 : "OGR_PARQUET_CRS_OMIT_IF_WGS84", "YES"));
695 : }
696 :
697 19 : if (bOmitCRS)
698 : {
699 : // do nothing
700 : }
701 8 : else if (EQUAL(CPLGetConfigOption(
702 : "OGR_PARQUET_CRS_ENCODING", "PROJJSON"),
703 : "PROJJSON"))
704 : {
705 : // CRS encoded as PROJJSON for GeoParquet >= 0.4.0
706 8 : char *pszPROJJSON = nullptr;
707 8 : oSRSIdentified.exportToPROJJSON(&pszPROJJSON, nullptr);
708 16 : CPLJSONDocument oCRSDoc;
709 8 : CPL_IGNORE_RET_VAL(oCRSDoc.LoadMemory(pszPROJJSON));
710 8 : CPLFree(pszPROJJSON);
711 8 : CPLJSONObject oCRSRoot = oCRSDoc.GetRoot();
712 8 : RemoveIDFromMemberOfEnsembles(oCRSRoot);
713 8 : oColumn.Add("crs", oCRSRoot);
714 : }
715 : else
716 : {
717 : // WKT was used in GeoParquet <= 0.3.0
718 0 : const char *const apszOptions[] = {
719 : "FORMAT=WKT2_2019", "MULTILINE=NO", nullptr};
720 0 : char *pszWKT = nullptr;
721 0 : oSRSIdentified.exportToWkt(&pszWKT, apszOptions);
722 0 : if (pszWKT)
723 0 : oColumn.Add("crs", pszWKT);
724 0 : CPLFree(pszWKT);
725 : }
726 :
727 19 : const double dfCoordEpoch = poSRS->GetCoordinateEpoch();
728 19 : if (dfCoordEpoch > 0)
729 2 : oColumn.Add("epoch", dfCoordEpoch);
730 : }
731 : else
732 : {
733 166 : oColumn.AddNull("crs");
734 : }
735 : }
736 :
737 186 : if (m_bEdgesSpherical)
738 : {
739 1 : oColumn.Add("edges", "spherical");
740 : }
741 :
742 348 : if (m_aoEnvelopes[i].IsInit() &&
743 162 : CPLTestBool(
744 : CPLGetConfigOption("OGR_PARQUET_WRITE_BBOX", "YES")))
745 : {
746 162 : bool bHasZ = false;
747 307 : for (const auto eGeomType : m_oSetWrittenGeometryTypes[i])
748 : {
749 204 : bHasZ = OGR_GT_HasZ(eGeomType);
750 204 : if (bHasZ)
751 59 : break;
752 : }
753 162 : CPLJSONArray oBBOX;
754 162 : oBBOX.Add(m_aoEnvelopes[i].MinX);
755 162 : oBBOX.Add(m_aoEnvelopes[i].MinY);
756 162 : if (bHasZ)
757 59 : oBBOX.Add(m_aoEnvelopes[i].MinZ);
758 162 : oBBOX.Add(m_aoEnvelopes[i].MaxX);
759 162 : oBBOX.Add(m_aoEnvelopes[i].MaxY);
760 162 : if (bHasZ)
761 59 : oBBOX.Add(m_aoEnvelopes[i].MaxZ);
762 162 : oColumn.Add("bbox", oBBOX);
763 : }
764 :
765 : // Bounding box column definition
766 330 : if (m_bWriteBBoxStruct &&
767 144 : CPLTestBool(CPLGetConfigOption(
768 : "OGR_PARQUET_WRITE_COVERING_BBOX_IN_METADATA", "YES")))
769 : {
770 288 : CPLJSONObject oCovering;
771 144 : oColumn.Add("covering", oCovering);
772 288 : CPLJSONObject oBBOX;
773 144 : oCovering.Add("bbox", oBBOX);
774 : const auto AddComponent =
775 1728 : [this, i, &oBBOX](const char *pszComponent)
776 : {
777 576 : CPLJSONArray oArray;
778 576 : oArray.Add(m_apoFieldsBBOX[i]->name());
779 576 : oArray.Add(pszComponent);
780 576 : oBBOX.Add(pszComponent, oArray);
781 576 : };
782 144 : AddComponent("xmin");
783 144 : AddComponent("ymin");
784 144 : AddComponent("xmax");
785 144 : AddComponent("ymax");
786 : }
787 :
788 220 : const auto GetStringGeometryType = [](OGRwkbGeometryType eType)
789 : {
790 220 : const auto eFlattenType = wkbFlatten(eType);
791 220 : std::string osType = "Unknown";
792 220 : if (wkbPoint == eFlattenType)
793 53 : osType = "Point";
794 167 : else if (wkbLineString == eFlattenType)
795 26 : osType = "LineString";
796 141 : else if (wkbPolygon == eFlattenType)
797 40 : osType = "Polygon";
798 101 : else if (wkbMultiPoint == eFlattenType)
799 18 : osType = "MultiPoint";
800 83 : else if (wkbMultiLineString == eFlattenType)
801 21 : osType = "MultiLineString";
802 62 : else if (wkbMultiPolygon == eFlattenType)
803 57 : osType = "MultiPolygon";
804 5 : else if (wkbGeometryCollection == eFlattenType)
805 5 : osType = "GeometryCollection";
806 220 : if (osType != "Unknown")
807 : {
808 : // M and ZM not supported officially currently, but it
809 : // doesn't hurt to anticipate
810 220 : if (OGR_GT_HasZ(eType) && OGR_GT_HasM(eType))
811 8 : osType += " ZM";
812 212 : else if (OGR_GT_HasZ(eType))
813 67 : osType += " Z";
814 145 : else if (OGR_GT_HasM(eType))
815 8 : osType += " M";
816 : }
817 220 : return osType;
818 : };
819 :
820 186 : if (m_bForceCounterClockwiseOrientation)
821 185 : oColumn.Add("orientation", "counterclockwise");
822 :
823 186 : CPLJSONArray oArray;
824 406 : for (const auto eType : m_oSetWrittenGeometryTypes[i])
825 : {
826 220 : oArray.Add(GetStringGeometryType(eType));
827 : }
828 186 : oColumn.Add("geometry_types", oArray);
829 : }
830 :
831 169 : return oRoot.Format(CPLJSONObject::PrettyFormat::Plain);
832 : }
833 12 : return std::string();
834 : }
835 :
836 : /************************************************************************/
837 : /* PerformStepsBeforeFinalFlushGroup() */
838 : /************************************************************************/
839 :
840 197 : void OGRParquetWriterLayer::PerformStepsBeforeFinalFlushGroup()
841 : {
842 197 : if (m_poKeyValueMetadata)
843 : {
844 394 : const std::string osGeoMetadata = GetGeoMetadata();
845 394 : auto poTmpSchema = m_poSchema;
846 197 : if (!osGeoMetadata.empty())
847 : {
848 : // HACK: it would be good for Arrow to provide a clean way to alter
849 : // key value metadata before finalizing.
850 : // We need to write metadata at end to write the bounding box.
851 185 : const_cast<arrow::KeyValueMetadata *>(m_poKeyValueMetadata.get())
852 185 : ->Append("geo", osGeoMetadata);
853 :
854 185 : auto kvMetadata = poTmpSchema->metadata()
855 8 : ? poTmpSchema->metadata()->Copy()
856 193 : : std::make_shared<arrow::KeyValueMetadata>();
857 185 : kvMetadata->Append("geo", osGeoMetadata);
858 185 : poTmpSchema = poTmpSchema->WithMetadata(kvMetadata);
859 : }
860 :
861 197 : if (CPLTestBool(
862 : CPLGetConfigOption("OGR_PARQUET_WRITE_ARROW_SCHEMA", "YES")))
863 : {
864 : auto status =
865 394 : ::arrow::ipc::SerializeSchema(*poTmpSchema, m_poMemoryPool);
866 197 : if (status.ok())
867 : {
868 : // The serialized schema is not UTF-8, which is required for
869 : // Thrift
870 394 : const std::string schema_as_string = (*status)->ToString();
871 : const std::string schema_base64 =
872 197 : ::arrow::util::base64_encode(schema_as_string);
873 197 : static const std::string kArrowSchemaKey = "ARROW:schema";
874 : const_cast<arrow::KeyValueMetadata *>(
875 197 : m_poKeyValueMetadata.get())
876 197 : ->Append(kArrowSchemaKey, schema_base64);
877 : }
878 : }
879 :
880 : // Put GDAL metadata into a gdal:metadata domain
881 394 : CPLJSONObject oMultiMetadata;
882 197 : bool bHasMultiMetadata = false;
883 200 : auto &l_oMDMD = oMDMD.GetDomainList() && *(oMDMD.GetDomainList())
884 200 : ? oMDMD
885 194 : : m_poDataset->GetMultiDomainMetadata();
886 202 : for (CSLConstList papszDomainIter = l_oMDMD.GetDomainList();
887 202 : papszDomainIter && *papszDomainIter; ++papszDomainIter)
888 : {
889 5 : const char *pszDomain = *papszDomainIter;
890 5 : CSLConstList papszMD = l_oMDMD.GetMetadata(pszDomain);
891 5 : if (STARTS_WITH(pszDomain, "json:") && papszMD && papszMD[0])
892 : {
893 1 : CPLJSONDocument oDoc;
894 1 : if (oDoc.LoadMemory(papszMD[0]))
895 : {
896 1 : bHasMultiMetadata = true;
897 1 : oMultiMetadata.Add(pszDomain, oDoc.GetRoot());
898 1 : continue;
899 0 : }
900 : }
901 4 : else if (STARTS_WITH(pszDomain, "xml:") && papszMD && papszMD[0])
902 : {
903 1 : bHasMultiMetadata = true;
904 1 : oMultiMetadata.Add(pszDomain, papszMD[0]);
905 1 : continue;
906 : }
907 6 : CPLJSONObject oMetadata;
908 3 : bool bHasMetadata = false;
909 6 : for (CSLConstList papszMDIter = papszMD;
910 6 : papszMDIter && *papszMDIter; ++papszMDIter)
911 : {
912 3 : char *pszKey = nullptr;
913 3 : const char *pszValue = CPLParseNameValue(*papszMDIter, &pszKey);
914 3 : if (pszKey && pszValue)
915 : {
916 3 : bHasMetadata = true;
917 3 : bHasMultiMetadata = true;
918 3 : oMetadata.Add(pszKey, pszValue);
919 : }
920 3 : CPLFree(pszKey);
921 : }
922 3 : if (bHasMetadata)
923 3 : oMultiMetadata.Add(pszDomain, oMetadata);
924 : }
925 197 : if (bHasMultiMetadata)
926 : {
927 3 : const_cast<arrow::KeyValueMetadata *>(m_poKeyValueMetadata.get())
928 3 : ->Append(
929 : "gdal:metadata",
930 6 : oMultiMetadata.Format(CPLJSONObject::PrettyFormat::Plain));
931 : }
932 : }
933 197 : }
934 :
935 : /************************************************************************/
936 : /* Open() */
937 : /************************************************************************/
938 :
939 : // Same as parquet::arrow::FileWriter::Open(), except we also
940 : // return KeyValueMetadata
941 : static arrow::Status
942 197 : Open(const ::arrow::Schema &schema, ::arrow::MemoryPool *pool,
943 : std::shared_ptr<::arrow::io::OutputStream> sink,
944 : std::shared_ptr<parquet::WriterProperties> properties,
945 : std::shared_ptr<parquet::ArrowWriterProperties> arrow_properties,
946 : std::unique_ptr<parquet::arrow::FileWriter> *writer,
947 : std::shared_ptr<const arrow::KeyValueMetadata> *outMetadata)
948 : {
949 197 : std::shared_ptr<parquet::SchemaDescriptor> parquet_schema;
950 394 : RETURN_NOT_OK(parquet::arrow::ToParquetSchema(
951 : &schema, *properties, *arrow_properties, &parquet_schema));
952 :
953 : auto schema_node = std::static_pointer_cast<parquet::schema::GroupNode>(
954 394 : parquet_schema->schema_root());
955 :
956 197 : auto metadata = schema.metadata()
957 13 : ? schema.metadata()->Copy()
958 407 : : std::make_shared<arrow::KeyValueMetadata>();
959 197 : *outMetadata = metadata;
960 :
961 197 : std::unique_ptr<parquet::ParquetFileWriter> base_writer;
962 197 : PARQUET_CATCH_NOT_OK(base_writer = parquet::ParquetFileWriter::Open(
963 : std::move(sink), std::move(schema_node),
964 : std::move(properties), metadata));
965 :
966 197 : auto schema_ptr = std::make_shared<::arrow::Schema>(schema);
967 : return parquet::arrow::FileWriter::Make(
968 394 : pool, std::move(base_writer), std::move(schema_ptr),
969 591 : std::move(arrow_properties), writer);
970 : }
971 :
972 : /************************************************************************/
973 : /* CreateSchema() */
974 : /************************************************************************/
975 :
976 197 : void OGRParquetWriterLayer::CreateSchema()
977 : {
978 197 : CreateSchemaCommon();
979 197 : }
980 :
981 : /************************************************************************/
982 : /* CreateGeomField() */
983 : /************************************************************************/
984 :
985 27 : OGRErr OGRParquetWriterLayer::CreateGeomField(const OGRGeomFieldDefn *poField,
986 : int bApproxOK)
987 : {
988 27 : OGRErr eErr = OGRArrowWriterLayer::CreateGeomField(poField, bApproxOK);
989 53 : if (eErr == OGRERR_NONE &&
990 26 : m_aeGeomEncoding.back() == OGRArrowGeomEncoding::WKB)
991 : {
992 2 : m_oWriterPropertiesBuilder.disable_statistics(
993 6 : parquet::schema::ColumnPath::FromDotString(
994 2 : m_poFeatureDefn
995 2 : ->GetGeomFieldDefn(m_poFeatureDefn->GetGeomFieldCount() - 1)
996 : ->GetNameRef()));
997 : }
998 27 : return eErr;
999 : }
1000 :
1001 : /************************************************************************/
1002 : /* CreateWriter() */
1003 : /************************************************************************/
1004 :
1005 197 : void OGRParquetWriterLayer::CreateWriter()
1006 : {
1007 197 : CPLAssert(m_poFileWriter == nullptr);
1008 :
1009 197 : if (m_poSchema == nullptr)
1010 : {
1011 38 : CreateSchema();
1012 : }
1013 : else
1014 : {
1015 159 : FinalizeSchema();
1016 : }
1017 :
1018 : auto arrowWriterProperties =
1019 197 : parquet::ArrowWriterProperties::Builder().store_schema()->build();
1020 591 : CPL_IGNORE_RET_VAL(Open(*m_poSchema, m_poMemoryPool, m_poOutputStream,
1021 394 : m_oWriterPropertiesBuilder.build(),
1022 197 : std::move(arrowWriterProperties), &m_poFileWriter,
1023 : &m_poKeyValueMetadata));
1024 197 : }
1025 :
1026 : /************************************************************************/
1027 : /* ICreateFeature() */
1028 : /************************************************************************/
1029 :
1030 2806 : OGRErr OGRParquetWriterLayer::ICreateFeature(OGRFeature *poFeature)
1031 : {
1032 : // If not using SORT_BY_BBOX=YES layer creation option, we can directly
1033 : // write features to the final Parquet file
1034 2806 : if (!m_poTmpGPKGLayer)
1035 602 : return OGRArrowWriterLayer::ICreateFeature(poFeature);
1036 :
1037 : // SORT_BY_BBOX=YES case: we write for now a serialized version of poFeature
1038 : // in a temporary GeoPackage file.
1039 :
1040 2204 : GIntBig nFID = poFeature->GetFID();
1041 2204 : if (!m_osFIDColumn.empty() && nFID == OGRNullFID)
1042 : {
1043 1102 : nFID = m_nTmpFeatureCount;
1044 1102 : poFeature->SetFID(nFID);
1045 : }
1046 2204 : ++m_nTmpFeatureCount;
1047 :
1048 4408 : std::vector<GByte> abyBuffer;
1049 : // Serialize the source feature as a single array of bytes to preserve it
1050 : // fully
1051 2204 : if (!poFeature->SerializeToBinary(abyBuffer))
1052 : {
1053 0 : return OGRERR_FAILURE;
1054 : }
1055 :
1056 : // SQLite3 limitation: a row must fit in slightly less than 1 GB.
1057 2204 : constexpr int SOME_MARGIN = 128;
1058 2204 : if (abyBuffer.size() > 1024 * 1024 * 1024 - SOME_MARGIN)
1059 : {
1060 0 : CPLError(CE_Failure, CPLE_NotSupported,
1061 : "Features larger than 1 GB are not supported");
1062 0 : return OGRERR_FAILURE;
1063 : }
1064 :
1065 4408 : OGRFeature oFeat(m_poTmpGPKGLayer->GetLayerDefn());
1066 2204 : oFeat.SetFID(nFID);
1067 2204 : oFeat.SetField(0, static_cast<int>(abyBuffer.size()), abyBuffer.data());
1068 2204 : const auto poSrcGeom = poFeature->GetGeometryRef();
1069 2204 : if (poSrcGeom && !poSrcGeom->IsEmpty())
1070 : {
1071 : // For the purpose of building an RTree, just use the bounding box of
1072 : // the geometry as the geometry.
1073 1202 : OGREnvelope sEnvelope;
1074 1202 : poSrcGeom->getEnvelope(&sEnvelope);
1075 2404 : auto poPoly = std::make_unique<OGRPolygon>();
1076 2404 : auto poLR = std::make_unique<OGRLinearRing>();
1077 1202 : poLR->addPoint(sEnvelope.MinX, sEnvelope.MinY);
1078 1202 : poLR->addPoint(sEnvelope.MinX, sEnvelope.MaxY);
1079 1202 : poLR->addPoint(sEnvelope.MaxX, sEnvelope.MaxY);
1080 1202 : poLR->addPoint(sEnvelope.MaxX, sEnvelope.MinY);
1081 1202 : poLR->addPoint(sEnvelope.MinX, sEnvelope.MinY);
1082 1202 : poPoly->addRingDirectly(poLR.release());
1083 1202 : oFeat.SetGeometryDirectly(poPoly.release());
1084 : }
1085 2204 : return m_poTmpGPKGLayer->CreateFeature(&oFeat);
1086 : }
1087 :
1088 : /************************************************************************/
1089 : /* FlushGroup() */
1090 : /************************************************************************/
1091 :
1092 186 : bool OGRParquetWriterLayer::FlushGroup()
1093 : {
1094 372 : auto status = m_poFileWriter->NewRowGroup(m_apoBuilders[0]->length());
1095 186 : if (!status.ok())
1096 : {
1097 0 : CPLError(CE_Failure, CPLE_AppDefined, "NewRowGroup() failed with %s",
1098 0 : status.message().c_str());
1099 0 : ClearArrayBuilers();
1100 0 : return false;
1101 : }
1102 :
1103 186 : auto ret = WriteArrays(
1104 845 : [this](const std::shared_ptr<arrow::Field> &field,
1105 845 : const std::shared_ptr<arrow::Array> &array)
1106 : {
1107 1690 : auto l_status = m_poFileWriter->WriteColumnChunk(*array);
1108 845 : if (!l_status.ok())
1109 : {
1110 0 : CPLError(CE_Failure, CPLE_AppDefined,
1111 : "WriteColumnChunk() failed for field %s: %s",
1112 0 : field->name().c_str(), l_status.message().c_str());
1113 0 : return false;
1114 : }
1115 845 : return true;
1116 : });
1117 :
1118 186 : ClearArrayBuilers();
1119 186 : return ret;
1120 : }
1121 :
1122 : /************************************************************************/
1123 : /* FixupWKBGeometryBeforeWriting() */
1124 : /************************************************************************/
1125 :
1126 25 : void OGRParquetWriterLayer::FixupWKBGeometryBeforeWriting(GByte *pabyWkb,
1127 : size_t nLen)
1128 : {
1129 25 : if (!m_bForceCounterClockwiseOrientation)
1130 0 : return;
1131 :
1132 25 : OGRWKBFixupCounterClockWiseExternalRing(pabyWkb, nLen);
1133 : }
1134 :
1135 : /************************************************************************/
1136 : /* FixupGeometryBeforeWriting() */
1137 : /************************************************************************/
1138 :
1139 1329 : void OGRParquetWriterLayer::FixupGeometryBeforeWriting(OGRGeometry *poGeom)
1140 : {
1141 1329 : if (!m_bForceCounterClockwiseOrientation)
1142 3 : return;
1143 :
1144 1326 : const auto eFlattenType = wkbFlatten(poGeom->getGeometryType());
1145 : // Polygon rings MUST follow the right-hand rule for orientation
1146 : // (counterclockwise external rings, clockwise internal rings)
1147 1326 : if (eFlattenType == wkbPolygon)
1148 : {
1149 44 : bool bFirstRing = true;
1150 91 : for (auto poRing : poGeom->toPolygon())
1151 : {
1152 55 : if ((bFirstRing && poRing->isClockwise()) ||
1153 8 : (!bFirstRing && !poRing->isClockwise()))
1154 : {
1155 42 : poRing->reverseWindingOrder();
1156 : }
1157 47 : bFirstRing = false;
1158 : }
1159 : }
1160 1282 : else if (eFlattenType == wkbMultiPolygon ||
1161 : eFlattenType == wkbGeometryCollection)
1162 : {
1163 35 : for (auto poSubGeom : poGeom->toGeometryCollection())
1164 : {
1165 21 : FixupGeometryBeforeWriting(poSubGeom);
1166 : }
1167 : }
1168 : }
1169 :
1170 : /************************************************************************/
1171 : /* WriteArrowBatch() */
1172 : /************************************************************************/
1173 :
#if PARQUET_VERSION_MAJOR > 10
// Writes an Arrow batch.
//
// With the SORT_BY_BBOX=YES option (temporary GeoPackage layer present), the
// generic OGRLayer implementation is used, since features have to be sorted
// and thus cannot be written directly; it will ultimately call
// OGRParquetWriterLayer::ICreateFeature(). Otherwise each record batch is
// written as a new buffered row group of the Parquet file.
inline bool
OGRParquetWriterLayer::WriteArrowBatch(const struct ArrowSchema *schema,
                                       struct ArrowArray *array,
                                       CSLConstList papszOptions)
{
    if (m_poTmpGPKGLayer)
    {
        return OGRLayer::WriteArrowBatch(schema, array, papszOptions);
    }

    // Callback writing one record batch in its own buffered row group.
    const auto writeRecordBatch =
        [this](const std::shared_ptr<arrow::RecordBatch> &poBatch) -> bool
    {
        const auto oNewGroupStatus = m_poFileWriter->NewBufferedRowGroup();
        if (!oNewGroupStatus.ok())
        {
            CPLError(CE_Failure, CPLE_AppDefined,
                     "NewBufferedRowGroup() failed with %s",
                     oNewGroupStatus.message().c_str());
            return false;
        }

        const auto oWriteStatus = m_poFileWriter->WriteRecordBatch(*poBatch);
        if (!oWriteStatus.ok())
        {
            CPLError(CE_Failure, CPLE_AppDefined,
                     "WriteRecordBatch() failed: %s",
                     oWriteStatus.message().c_str());
            return false;
        }

        return true;
    };

    return WriteArrowBatchInternal(schema, array, papszOptions,
                                   writeRecordBatch);
}
#endif
1215 :
1216 : /************************************************************************/
1217 : /* TestCapability() */
1218 : /************************************************************************/
1219 :
1220 352 : inline int OGRParquetWriterLayer::TestCapability(const char *pszCap)
1221 : {
1222 : #if PARQUET_VERSION_MAJOR <= 10
1223 : if (EQUAL(pszCap, OLCFastWriteArrowBatch))
1224 : return false;
1225 : #endif
1226 :
1227 352 : if (m_poTmpGPKGLayer && EQUAL(pszCap, OLCFastWriteArrowBatch))
1228 : {
1229 : // When using SORT_BY_BBOX=YES option, we can't directly write the
1230 : // input array, because we need to sort features. So this is not
1231 : // fast
1232 1 : return false;
1233 : }
1234 :
1235 351 : return OGRArrowWriterLayer::TestCapability(pszCap);
1236 : }
1237 :
1238 : /************************************************************************/
1239 : /* CreateFieldFromArrowSchema() */
1240 : /************************************************************************/
1241 :
#if PARQUET_VERSION_MAJOR > 10
// Creates a field from an Arrow schema.
//
// With the SORT_BY_BBOX=YES option (temporary GeoPackage layer present), the
// input array cannot be written directly because features need to be
// sorted, and that process only supports the base Arrow types handled by
// OGRLayer::WriteArrowBatch(): the OGRLayer implementation is used in that
// case, and the OGRArrowWriterLayer one otherwise.
bool OGRParquetWriterLayer::CreateFieldFromArrowSchema(
    const struct ArrowSchema *schema, CSLConstList papszOptions)
{
    return m_poTmpGPKGLayer
               ? OGRLayer::CreateFieldFromArrowSchema(schema, papszOptions)
               : OGRArrowWriterLayer::CreateFieldFromArrowSchema(
                     schema, papszOptions);
}
#endif
1259 :
1260 : /************************************************************************/
1261 : /* IsArrowSchemaSupported() */
1262 : /************************************************************************/
1263 :
#if PARQUET_VERSION_MAJOR > 10
// Tells whether an Arrow schema can be written by this layer.
//
// With the SORT_BY_BBOX=YES option (temporary GeoPackage layer present), the
// answer comes from OGRLayer::IsArrowSchemaSupported(), since that mode only
// supports the base Arrow types handled by OGRLayer::WriteArrowBatch().
// Otherwise, the schema and all its children (recursively) are accepted
// unless one of them has the "e" format, reported as float16.
//
// On rejection, osErrorMsg receives the reason and false is returned.
bool OGRParquetWriterLayer::IsArrowSchemaSupported(
    const struct ArrowSchema *schema, CSLConstList papszOptions,
    std::string &osErrorMsg) const
{
    if (m_poTmpGPKGLayer)
    {
        return OGRLayer::IsArrowSchemaSupported(schema, papszOptions,
                                                osErrorMsg);
    }

    const char *pszFormat = schema->format;
    const bool bIsFloat16 = pszFormat[0] == 'e' && pszFormat[1] == 0;
    if (bIsFloat16)
    {
        osErrorMsg = "float16 not supported";
        return false;
    }

    // Stop at the first unsupported child.
    for (int64_t iChild = 0; iChild < schema->n_children; ++iChild)
    {
        const struct ArrowSchema *psChild = schema->children[iChild];
        if (!IsArrowSchemaSupported(psChild, papszOptions, osErrorMsg))
        {
            return false;
        }
    }
    return true;
}
#endif
1295 :
1296 : /************************************************************************/
1297 : /* SetMetadata() */
1298 : /************************************************************************/
1299 :
1300 6 : CPLErr OGRParquetWriterLayer::SetMetadata(char **papszMetadata,
1301 : const char *pszDomain)
1302 : {
1303 6 : if (!pszDomain || !EQUAL(pszDomain, "SHAPEFILE"))
1304 : {
1305 4 : return OGRLayer::SetMetadata(papszMetadata, pszDomain);
1306 : }
1307 2 : return CE_None;
1308 : }
1309 :
1310 : /************************************************************************/
1311 : /* GetDataset() */
1312 : /************************************************************************/
1313 :
1314 17 : GDALDataset *OGRParquetWriterLayer::GetDataset()
1315 : {
1316 17 : return m_poDataset;
1317 : }
|