Line data Source code
1 : /******************************************************************************
2 : *
3 : * Project: GML Reader
4 : * Purpose: Implementation of GMLReader::ResolveXlinks() method.
5 : * Author: Chaitanya kumar CH, chaitanya@osgeo.in
6 : *
7 : ******************************************************************************
8 : * Copyright (c) 2010, Chaitanya kumar CH
9 : * Copyright (c) 2010-2014, Even Rouault <even dot rouault at spatialys.com>
10 : *
11 : * SPDX-License-Identifier: MIT
12 : ****************************************************************************/
13 :
14 : #include "cpl_port.h"
15 : #include "gmlreader.h"
16 : #include "gmlreaderp.h"
17 :
18 : #include <cstddef>
19 : #include <cstring>
20 :
21 : #include "cpl_conv.h"
22 : #include "cpl_error.h"
23 : #include "cpl_http.h"
24 : #include "cpl_minixml.h"
25 : #include "cpl_string.h"
26 :
27 : /************************************************************************/
28 : /* GetID() */
29 : /* */
30 : /* Returns the reference to the gml:id of psNode. NULL if not */
31 : /* found. */
32 : /************************************************************************/
33 :
34 107255 : static const char *GetID(CPLXMLNode *psNode)
35 :
36 : {
37 107255 : if (psNode == nullptr)
38 0 : return nullptr;
39 :
40 261530 : for (CPLXMLNode *psChild = psNode->psChild; psChild != nullptr;
41 154275 : psChild = psChild->psNext)
42 : {
43 165003 : if (psChild->eType == CXT_Attribute &&
44 35151 : EQUAL(psChild->pszValue, "gml:id"))
45 : {
46 10728 : return psChild->psChild->pszValue;
47 : }
48 : }
49 96527 : return nullptr;
50 : }
51 :
52 : /************************************************************************/
53 : /* CompareNodeIDs() */
54 : /* */
55 : /* Compares two nodes by their IDs */
56 : /************************************************************************/
57 :
58 : /*static int CompareNodeIDs( CPLXMLNode * psNode1, CPLXMLNode * psNode2 )
59 :
60 : {
61 : if( psNode2 == NULL )
62 : return TRUE;
63 :
64 : if( psNode1 == NULL )
65 : return FALSE;
66 :
67 : return strcmp( GetID(psNode2), GetID(psNode1) ) > 0;
68 : }*/
69 :
70 : /************************************************************************/
71 : /* BuildIDIndex() */
72 : /* */
73 : /* Returns an array of nodes sorted by their gml:id strings */
74 : /* XXX: This method can be used to build an array of pointers to */
75 : /* nodes sorted by their id values. */
76 : /************************************************************************/
77 : /*
78 : static std::vector<CPLXMLNode*> BuildIDIndex( CPLXMLNode* psNode,
79 : std::vector<CPLXMLNode*> &apsNode )
80 :
81 : {
82 : for( CPLXMLNode *psSibling = psNode;
83 : psSibling != NULL;
84 : psSibling = psSibling->psNext )
85 : {
86 : if( GetID( psSibling ) != NULL )
87 : apsNode.push_back( psSibling );
88 : BuildIDIndex( psNode->psChild, apsNode );
89 : }
90 : return NULL;
91 : }*/
92 :
93 : /************************************************************************/
94 : /* FindElementByID() */
95 : /* */
96 : /* Find a node with the indicated "gml:id" in the node tree and */
97 : /* its siblings. */
98 : /************************************************************************/
99 :
100 100770 : static CPLXMLNode *FindElementByID(CPLXMLNode *psRoot, const char *pszID)
101 :
102 : {
103 100770 : if (psRoot == nullptr)
104 9109 : return nullptr;
105 :
106 : // Check for id attribute.
107 253761 : for (CPLXMLNode *psSibling = psRoot; psSibling != nullptr;
108 162100 : psSibling = psSibling->psNext)
109 : {
110 162413 : if (psSibling->eType == CXT_Element)
111 : {
112 : // check that sibling for id value
113 107255 : const char *pszIDOfSibling = GetID(psSibling);
114 107255 : if (pszIDOfSibling != nullptr && EQUAL(pszIDOfSibling, pszID))
115 313 : return psSibling;
116 : }
117 : }
118 :
119 : // Search the child elements of all the psRoot's siblings.
120 244818 : for (CPLXMLNode *psSibling = psRoot; psSibling != nullptr;
121 153470 : psSibling = psSibling->psNext)
122 : {
123 155557 : if (psSibling->eType == CXT_Element)
124 : {
125 100457 : CPLXMLNode *psReturn = FindElementByID(psSibling->psChild, pszID);
126 100457 : if (psReturn != nullptr)
127 2087 : return psReturn;
128 : }
129 : }
130 89261 : return nullptr;
131 : }
132 :
133 : /************************************************************************/
134 : /* RemoveIDs() */
135 : /* */
136 : /* Remove all the gml:id nodes. Doesn't check psRoot's siblings */
137 : /************************************************************************/
138 :
139 1690 : static void RemoveIDs(CPLXMLNode *psRoot)
140 :
141 : {
142 1690 : if (psRoot == nullptr)
143 0 : return;
144 :
145 1690 : CPLXMLNode *psChild = psRoot->psChild;
146 :
147 : // Check for id attribute.
148 3244 : while (psChild != nullptr && !(psChild->eType == CXT_Attribute &&
149 987 : EQUAL(psChild->pszValue, "gml:id")))
150 1554 : psChild = psChild->psNext;
151 1690 : CPLRemoveXMLChild(psRoot, psChild);
152 1690 : CPLDestroyXMLNode(psChild);
153 :
154 : // Search the child elements of psRoot.
155 3775 : for (psChild = psRoot->psChild; psChild != nullptr;
156 2085 : psChild = psChild->psNext)
157 2085 : if (psChild->eType == CXT_Element)
158 1377 : RemoveIDs(psChild);
159 : }
160 :
161 : /************************************************************************/
162 : /* TrimTree() */
163 : /* */
164 : /* Remove all nodes without a gml:id node in the descendants. */
165 : /* Returns TRUE if there is a gml:id node in the descendants. */
166 : /************************************************************************/
167 :
168 0 : static bool TrimTree(CPLXMLNode *psRoot)
169 :
170 : {
171 0 : if (psRoot == nullptr)
172 0 : return false;
173 :
174 0 : CPLXMLNode *psChild = psRoot->psChild;
175 :
176 : // Check for id attribute.
177 0 : while (psChild != nullptr && !(psChild->eType == CXT_Attribute &&
178 0 : EQUAL(psChild->pszValue, "gml:id")))
179 0 : psChild = psChild->psNext;
180 :
181 0 : if (psChild != nullptr)
182 0 : return true;
183 :
184 : // Search the child elements of psRoot.
185 0 : bool bReturn = false;
186 0 : for (psChild = psRoot->psChild; psChild != nullptr;)
187 : {
188 0 : CPLXMLNode *psNextChild = psChild->psNext;
189 0 : if (psChild->eType == CXT_Element)
190 : {
191 0 : const bool bRemove = TrimTree(psChild);
192 0 : if (bRemove)
193 : {
194 0 : bReturn = bRemove;
195 : }
196 : else
197 : {
198 : // Remove this child.
199 0 : CPLRemoveXMLChild(psRoot, psChild);
200 0 : CPLDestroyXMLNode(psChild);
201 : }
202 : }
203 :
204 0 : psChild = psNextChild;
205 : }
206 0 : return bReturn;
207 : }
208 :
209 : /************************************************************************/
210 : /* CorrectURLs() */
211 : /* */
212 : /* Processes the node and all its children recursively. Siblings of */
213 : /* psRoot are ignored. */
214 : /* - Replaces every empty URL in URL#id pairs with pszURL. */
215 : /* - Leaves it alone if the paths are same or the URL is not relative. */
216 : /* - If it is relative, the path from pszURL is prepended. */
217 : /************************************************************************/
218 :
219 3490 : static void CorrectURLs(CPLXMLNode *psRoot, const char *pszURL)
220 :
221 : {
222 3490 : if (psRoot == nullptr || pszURL == nullptr)
223 0 : return;
224 3490 : if (pszURL[0] == '\0')
225 0 : return;
226 :
227 3490 : CPLXMLNode *psChild = psRoot->psChild;
228 :
229 : // Check for xlink:href attribute.
230 8449 : while (psChild != nullptr && !((psChild->eType == CXT_Attribute) &&
231 1552 : (EQUAL(psChild->pszValue, "xlink:href"))))
232 4959 : psChild = psChild->psNext;
233 :
234 3490 : if (psChild != nullptr &&
235 313 : !(strstr(psChild->psChild->pszValue, pszURL) ==
236 313 : psChild->psChild->pszValue &&
237 0 : psChild->psChild->pszValue[strlen(pszURL)] == '#'))
238 : {
239 : // href has a different url.
240 313 : if (psChild->psChild->pszValue[0] == '#')
241 : {
242 : // Empty URL: prepend the given URL.
243 313 : const size_t nLen = CPLStrnlen(pszURL, 1024) +
244 313 : CPLStrnlen(psChild->psChild->pszValue, 1024) +
245 313 : 1;
246 313 : char *pszNew = static_cast<char *>(CPLMalloc(nLen * sizeof(char)));
247 313 : CPLStrlcpy(pszNew, pszURL, nLen);
248 313 : CPLStrlcat(pszNew, psChild->psChild->pszValue, nLen);
249 313 : CPLSetXMLValue(psRoot, "#xlink:href", pszNew);
250 313 : CPLFree(pszNew);
251 : }
252 : else
253 : {
254 0 : size_t nPathLen = strlen(pszURL); // Used after for.
255 0 : for (; nPathLen > 0 && pszURL[nPathLen - 1] != '/' &&
256 0 : pszURL[nPathLen - 1] != '\\';
257 : nPathLen--)
258 : {
259 : }
260 :
261 0 : const char *pszDash = strchr(psChild->psChild->pszValue, '#');
262 0 : if (pszDash != nullptr &&
263 0 : strncmp(pszURL, psChild->psChild->pszValue, nPathLen) != 0)
264 : {
265 : // Different path.
266 0 : const int nURLLen =
267 0 : static_cast<int>(pszDash - psChild->psChild->pszValue);
268 : char *pszURLWithoutID = static_cast<char *>(
269 0 : CPLMalloc((nURLLen + 1) * sizeof(char)));
270 0 : strncpy(pszURLWithoutID, psChild->psChild->pszValue, nURLLen);
271 0 : pszURLWithoutID[nURLLen] = '\0';
272 :
273 0 : if (CPLIsFilenameRelative(pszURLWithoutID) &&
274 0 : strstr(pszURLWithoutID, ":") == nullptr)
275 : {
276 : // Relative URL: prepend the path of pszURL.
277 : const size_t nLen =
278 0 : nPathLen +
279 0 : CPLStrnlen(psChild->psChild->pszValue, 1024) + 1;
280 : char *pszNew =
281 0 : static_cast<char *>(CPLMalloc(nLen * sizeof(char)));
282 0 : for (size_t i = 0; i < nPathLen; i++)
283 0 : pszNew[i] = pszURL[i];
284 0 : pszNew[nPathLen] = '\0';
285 0 : CPLStrlcat(pszNew, psChild->psChild->pszValue, nLen);
286 0 : CPLSetXMLValue(psRoot, "#xlink:href", pszNew);
287 0 : CPLFree(pszNew);
288 : }
289 0 : CPLFree(pszURLWithoutID);
290 : }
291 : }
292 : }
293 :
294 : // Search the child elements of psRoot.
295 8762 : for (psChild = psRoot->psChild; psChild != nullptr;
296 5272 : psChild = psChild->psNext)
297 5272 : if (psChild->eType == CXT_Element)
298 3171 : CorrectURLs(psChild, pszURL);
299 : }
300 :
301 : /************************************************************************/
302 : /* FindTreeByURL() */
303 : /* */
304 : /* Find a doc tree that is located at pszURL. */
305 : /* If not present in ppapsRoot, it updates it and ppapszResourceHREF. */
306 : /************************************************************************/
307 :
308 313 : static CPLXMLNode *FindTreeByURL(CPLXMLNode ***ppapsRoot,
309 : char ***ppapszResourceHREF, const char *pszURL)
310 :
311 : {
312 313 : if (*ppapsRoot == nullptr || ppapszResourceHREF == nullptr)
313 0 : return nullptr;
314 :
315 : // If found in ppapszResourceHREF.
316 313 : const int i = CSLFindString(*ppapszResourceHREF, pszURL);
317 313 : if (i >= 0)
318 : {
319 : // Return corresponding psRoot.
320 313 : return (*ppapsRoot)[i];
321 : }
322 :
323 0 : CPLXMLNode *psSrcTree = nullptr;
324 0 : char *pszLocation = CPLStrdup(pszURL);
325 : // If it is part of filesystem.
326 0 : if (CPLCheckForFile(pszLocation, nullptr))
327 : {
328 : // Filesystem.
329 0 : psSrcTree = CPLParseXMLFile(pszURL);
330 : }
331 0 : else if (CPLHTTPEnabled())
332 : {
333 : // Web resource.
334 0 : CPLErrorReset();
335 0 : CPLHTTPResult *psResult = CPLHTTPFetch(pszURL, nullptr);
336 0 : if (psResult != nullptr)
337 : {
338 0 : if (psResult->nDataLen > 0 && CPLGetLastErrorNo() == 0)
339 0 : psSrcTree = CPLParseXMLString(
340 0 : reinterpret_cast<const char *>(psResult->pabyData));
341 0 : CPLHTTPDestroyResult(psResult);
342 : }
343 : }
344 :
345 : // Report error in case the resource cannot be retrieved.
346 0 : if (psSrcTree == nullptr)
347 0 : CPLError(CE_Failure, CPLE_NotSupported, "Could not access %s",
348 : pszLocation);
349 :
350 0 : CPLFree(pszLocation);
351 :
352 : /************************************************************************/
353 : /* In the external GML resource we will only need elements */
354 : /* identified by a "gml:id". So trim them. */
355 : /************************************************************************/
356 0 : CPLXMLNode *psSibling = psSrcTree;
357 0 : while (psSibling != nullptr)
358 : {
359 0 : TrimTree(psSibling);
360 0 : psSibling = psSibling->psNext;
361 : }
362 :
363 : // Update to lists.
364 0 : int nItems = CSLCount(*ppapszResourceHREF);
365 0 : *ppapszResourceHREF = CSLAddString(*ppapszResourceHREF, pszURL);
366 0 : *ppapsRoot = static_cast<CPLXMLNode **>(
367 0 : CPLRealloc(*ppapsRoot, (nItems + 2) * sizeof(CPLXMLNode *)));
368 0 : (*ppapsRoot)[nItems] = psSrcTree;
369 0 : (*ppapsRoot)[nItems + 1] = nullptr;
370 :
371 : // Return the tree.
372 0 : return (*ppapsRoot)[nItems];
373 : }
374 :
375 : /************************************************************************/
376 : /* Resolve() */
377 : /* Resolves the xlinks in a node and its siblings. */
378 : /* If any error is encountered: */
379 : /* If bStrict is TRUE, the process is stopped and CE_Failure is returned. */
380 : /* If bStrict is FALSE, the process is continued but CE_Warning is */
381 : /* returned at the end. Skipped elements (papszSkip) also give CE_Warning. */
382 : /* If everything goes fine, CE_None is returned. */
383 : /************************************************************************/
384 :
385 3493 : static CPLErr Resolve(CPLXMLNode *psNode, CPLXMLNode ***ppapsRoot,
386 : char ***ppapszResourceHREF, char **papszSkip,
387 : const int bStrict, int nDepth)
388 :
389 : {
390 : // For each sibling.
391 3493 : CPLXMLNode *psSibling = nullptr;
392 3493 : CPLXMLNode *psResource = nullptr;
393 3493 : CPLXMLNode *psTarget = nullptr;
394 3493 : CPLErr eReturn = CE_None, eReturned;
395 :
396 8771 : for (psSibling = psNode; psSibling != nullptr;
397 5278 : psSibling = psSibling->psNext)
398 : {
399 5278 : if (psSibling->eType != CXT_Element)
400 1788 : continue;
401 :
402 3490 : CPLXMLNode *psChild = psSibling->psChild;
403 8449 : while (psChild != nullptr && !(psChild->eType == CXT_Attribute &&
404 1552 : EQUAL(psChild->pszValue, "xlink:href")))
405 4959 : psChild = psChild->psNext;
406 :
407 : // If a child has a "xlink:href" attribute.
408 3490 : if (psChild != nullptr && psChild->psChild != nullptr)
409 : {
410 313 : if (CSLFindString(papszSkip, psSibling->pszValue) >= 0)
411 : {
412 : // Skipping a specified element.
413 0 : eReturn = CE_Warning;
414 0 : continue;
415 : }
416 :
417 313 : const int nDepthCheck = 256;
418 313 : if (nDepth % nDepthCheck == 0)
419 : {
420 : // A way to track progress.
421 0 : CPLDebug("GML", "Resolving xlinks... (currently %s)",
422 0 : psChild->psChild->pszValue);
423 : }
424 :
425 626 : char **papszTokens = CSLTokenizeString2(
426 313 : psChild->psChild->pszValue, "#",
427 : CSLT_ALLOWEMPTYTOKENS | CSLT_STRIPLEADSPACES |
428 : CSLT_STRIPENDSPACES);
429 313 : if (CSLCount(papszTokens) != 2 || papszTokens[1][0] == '\0')
430 : {
431 0 : CPLError(bStrict ? CE_Failure : CE_Warning, CPLE_NotSupported,
432 : "Error parsing the href %s.%s",
433 0 : psChild->psChild->pszValue,
434 : bStrict ? "" : " Skipping...");
435 0 : CSLDestroy(papszTokens);
436 0 : if (bStrict)
437 0 : return CE_Failure;
438 0 : eReturn = CE_Warning;
439 0 : continue;
440 : }
441 :
442 : // Look for the resource with that URL.
443 : psResource =
444 313 : FindTreeByURL(ppapsRoot, ppapszResourceHREF, papszTokens[0]);
445 313 : if (psResource == nullptr)
446 : {
447 0 : CSLDestroy(papszTokens);
448 0 : if (bStrict)
449 0 : return CE_Failure;
450 0 : eReturn = CE_Warning;
451 0 : continue;
452 : }
453 :
454 : // Look for the element with the ID.
455 313 : psTarget = FindElementByID(psResource, papszTokens[1]);
456 313 : if (psTarget != nullptr)
457 : {
458 : // Remove the xlink:href attribute.
459 313 : CPLRemoveXMLChild(psSibling, psChild);
460 313 : CPLDestroyXMLNode(psChild);
461 :
462 : // Make a copy of psTarget.
463 : CPLXMLNode *psCopy =
464 313 : CPLCreateXMLNode(nullptr, CXT_Element, psTarget->pszValue);
465 313 : psCopy->psChild = CPLCloneXMLTree(psTarget->psChild);
466 313 : RemoveIDs(psCopy);
467 : // Correct empty URLs in URL#id pairs.
468 313 : if (CPLStrnlen(papszTokens[0], 1) > 0)
469 : {
470 313 : CorrectURLs(psCopy, papszTokens[0]);
471 : }
472 313 : CPLAddXMLChild(psSibling, psCopy);
473 313 : CSLDestroy(papszTokens);
474 : }
475 : else
476 : {
477 : // Element not found.
478 0 : CSLDestroy(papszTokens);
479 0 : CPLError(bStrict ? CE_Failure : CE_Warning, CPLE_ObjectNull,
480 : "Couldn't find the element with id %s.",
481 0 : psChild->psChild->pszValue);
482 0 : if (bStrict)
483 0 : return CE_Failure;
484 0 : eReturn = CE_Warning;
485 : }
486 : }
487 :
488 : // Recurse with the first child.
489 3490 : eReturned = Resolve(psSibling->psChild, ppapsRoot, ppapszResourceHREF,
490 : papszSkip, bStrict, nDepth + 1);
491 :
492 3490 : if (eReturned == CE_Failure)
493 0 : return CE_Failure;
494 :
495 3490 : if (eReturned == CE_Warning)
496 0 : eReturn = CE_Warning;
497 : }
498 3493 : return eReturn;
499 : }
500 :
501 : /************************************************************************/
502 : /* ResolveXlinks() */
503 : /* Returns true if the resolved file was saved, false otherwise. */
504 : /* - The file is saved to pszFile, or to a temporary file when that is */
505 : /* not possible; *pbOutIsTempFile is set to true in the latter case. */
506 : /* - Returns false if resolving failed or no file could be written. */
507 : /* - m_pszFilename will be set to the file the resolved file was */
508 : /* saved to. */
509 : /************************************************************************/
510 :
511 3 : bool GMLReader::ResolveXlinks(const char *pszFile, bool *pbOutIsTempFile,
512 : char **papszSkip, const bool bStrict)
513 :
514 : {
515 3 : *pbOutIsTempFile = false;
516 :
517 : // Check if the original source file is set.
518 3 : if (m_pszFilename == nullptr)
519 : {
520 0 : CPLError(CE_Failure, CPLE_NotSupported,
521 : "GML source file needs to be set first with "
522 : "GMLReader::SetSourceFile().");
523 0 : return false;
524 : }
525 :
526 : /* -------------------------------------------------------------------- */
527 : /* Load the raw XML file into a XML Node tree. */
528 : /* -------------------------------------------------------------------- */
529 : CPLXMLNode **papsSrcTree =
530 3 : static_cast<CPLXMLNode **>(CPLCalloc(2, sizeof(CPLXMLNode *)));
531 3 : papsSrcTree[0] = CPLParseXMLFile(m_pszFilename);
532 :
533 3 : if (papsSrcTree[0] == nullptr)
534 : {
535 0 : CPLFree(papsSrcTree);
536 0 : return false;
537 : }
538 :
539 : // Make all the URLs absolute.
540 3 : CPLXMLNode *psSibling = nullptr;
541 9 : for (psSibling = papsSrcTree[0]; psSibling != nullptr;
542 6 : psSibling = psSibling->psNext)
543 6 : CorrectURLs(psSibling, m_pszFilename);
544 :
545 : // Setup resource data structure.
546 3 : char **papszResourceHREF = nullptr;
547 : // "" is the href of the original source file.
548 3 : papszResourceHREF = CSLAddString(papszResourceHREF, m_pszFilename);
549 :
550 : // Call resolver.
551 3 : const CPLErr eReturned = Resolve(papsSrcTree[0], &papsSrcTree,
552 : &papszResourceHREF, papszSkip, bStrict, 0);
553 :
554 3 : bool bReturn = true;
555 3 : if (eReturned != CE_Failure)
556 : {
557 3 : char *pszTmpName = nullptr;
558 3 : bool bTryWithTempFile = false;
559 3 : if (STARTS_WITH_CI(pszFile, "/vsitar/") ||
560 3 : STARTS_WITH_CI(pszFile, "/vsigzip/") ||
561 3 : STARTS_WITH_CI(pszFile, "/vsizip/") ||
562 3 : STARTS_WITH_CI(pszFile, "/vsicurl"))
563 : {
564 0 : bTryWithTempFile = true;
565 : }
566 3 : else if (!CPLSerializeXMLTreeToFile(papsSrcTree[0], pszFile))
567 : {
568 0 : CPLError(CE_Failure, CPLE_FileIO,
569 : "Cannot serialize resolved file %s to %s.", m_pszFilename,
570 : pszFile);
571 0 : bTryWithTempFile = true;
572 : }
573 :
574 3 : if (bTryWithTempFile)
575 : {
576 : pszTmpName =
577 0 : CPLStrdup(CPLGenerateTempFilenameSafe("ResolvedGML").c_str());
578 0 : if (!CPLSerializeXMLTreeToFile(papsSrcTree[0], pszTmpName))
579 : {
580 0 : CPLError(CE_Failure, CPLE_FileIO,
581 : "Cannot serialize resolved file %s to %s either.",
582 : m_pszFilename, pszTmpName);
583 0 : CPLFree(pszTmpName);
584 0 : bReturn = false;
585 : }
586 : else
587 : {
588 : // Set the source file to the resolved file.
589 0 : CPLFree(m_pszFilename);
590 0 : m_pszFilename = pszTmpName;
591 0 : *pbOutIsTempFile = true;
592 : }
593 : }
594 : else
595 : {
596 : // Set the source file to the resolved file.
597 3 : CPLFree(m_pszFilename);
598 3 : m_pszFilename = CPLStrdup(pszFile);
599 : }
600 : }
601 : else
602 : {
603 0 : bReturn = false;
604 : }
605 :
606 3 : const int nItems = CSLCount(papszResourceHREF);
607 3 : CSLDestroy(papszResourceHREF);
608 6 : for (int i = 0; i < nItems; i++)
609 3 : CPLDestroyXMLNode(papsSrcTree[i]);
610 3 : CPLFree(papsSrcTree);
611 :
612 3 : return bReturn;
613 : }
|