summaryrefslogtreecommitdiffstats
path: root/src/parser/xml_namespace.cpp
blob: 2aafea3d8dba55642cd20205e63489c42d36e8f6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */

#include <orcus/xml_namespace.hpp>
#include <orcus/exception.hpp>
#include <orcus/string_pool.hpp>

#include <unordered_map>
#include <vector>
#include <limits>
#include <sstream>
#include <algorithm>
#include <functional>
#include <cassert>

#define ORCUS_DEBUG_XML_NAMESPACE 0

using namespace std;

#if ORCUS_DEBUG_XML_NAMESPACE
#include <cstdio>
#include <iostream>
#endif

namespace orcus {

namespace {

#if ORCUS_DEBUG_XML_NAMESPACE
/**
 * Debug helper that writes the keys of an associative container to standard
 * output in the form <code>keys: ('k1' 'k2' ...)</code>.
 *
 * Keys are separated by a single space and no trailing newline is written.
 *
 * @param map_store container whose elements expose their key as the
 *                  <code>first</code> member.
 */
template<typename MapT>
void print_map_keys(const MapT& map_store)
{
    std::cout << "keys: (";

    // Empty before the first key, a single space before every later one.
    const char* separator = "";
    for (const auto& entry : map_store)
    {
        std::cout << separator << "'" << entry.first << "'";
        separator = " ";
    }

    std::cout << ")";
}
#endif

}

/// Lookup table from a namespace URI to its numerical identifier.
using strid_map_type = std::unordered_map<std::string_view, std::size_t>;

/**
 * Private state of xmlns_repository.
 */
struct xmlns_repository::impl
{
    /// Number of entries registered through add_predefined_values().
    std::size_t m_predefined_ns_size = 0;

    /// Storage of live string instances interned at run time.
    string_pool m_pool;

    /// Namespace URIs in registration order; the position of an entry is
    /// its numerical identifier.
    std::vector<std::string_view> m_identifiers;

    /// String-to-numerical-identifier map for quick lookup.
    strid_map_type m_strid_map;

    impl() = default;
};

/// Create an empty repository with freshly allocated private state.
xmlns_repository::xmlns_repository() :
    mp_impl(std::make_unique<impl>())
{
}

/// Take over the private state of @p other.  The moved-from repository is
/// left with a null state pointer.
xmlns_repository::xmlns_repository(xmlns_repository&& other) :
    mp_impl(std::move(other.mp_impl))
{
}

xmlns_repository::~xmlns_repository() = default;

/// Replace this repository's private state with the one owned by @p other.
/// The state previously held by this instance is released, and @p other is
/// left with a null state pointer.
xmlns_repository& xmlns_repository::operator= (xmlns_repository&& other)
{
    auto& incoming = other.mp_impl;
    mp_impl = std::move(incoming);

    return *this;
}

/**
 * Register a namespace URI with this repository and return its identifier.
 *
 * A URI that is already registered (predefined or interned earlier) yields
 * the identifier stored for it.  Otherwise the URI is copied into the
 * internal string pool and, if the pool reports it as a new instance, it is
 * given the next numerical index.
 *
 * @param uri namespace URI to register.
 *
 * @return pointer to the first character of the stored URI, or
 *         XMLNS_UNKNOWN_ID when the pool hands back an empty string or
 *         raises general_error.
 */
xmlns_id_t xmlns_repository::intern(std::string_view uri)
{
    impl& state = *mp_impl;

    // Fast path: the uri is already registered.
    const auto known = state.m_strid_map.find(uri);
    if (known != state.m_strid_map.end())
        return known->first.data();

    try
    {
        const auto pooled = state.m_pool.intern(uri);
        const std::string_view stored = pooled.first;

        // An empty result means the pool did not store anything usable.
        if (stored.empty())
            return XMLNS_UNKNOWN_ID;

        if (pooled.second)
        {
            // Brand-new pool entry: its numerical identifier is the current
            // length of the identifier list.
            const std::size_t new_index = state.m_identifiers.size();
            state.m_strid_map.insert({stored, new_index});
#if ORCUS_DEBUG_XML_NAMESPACE
            std::cout << "xmlns_repository::intern: uri='" << stored << "' (" << new_index << ")" << std::endl;
#endif
            state.m_identifiers.push_back(stored);

#if ORCUS_DEBUG_XML_NAMESPACE
            std::cout << "pool size=" << state.m_pool.size() << ", predefined ns size=" << state.m_predefined_ns_size <<
                ", identifiers size=" << state.m_identifiers.size() << ", map size=" << state.m_strid_map.size() << std::endl;
#endif
            // Every identifier is either predefined or lives in the pool.
            assert(state.m_pool.size() + state.m_predefined_ns_size == state.m_identifiers.size());
            assert(state.m_pool.size() + state.m_predefined_ns_size == state.m_strid_map.size());
        }

        return stored.data();
    }
    catch (const general_error&)
    {
        // Deliberately swallowed: a pool failure is reported to the caller
        // as an unknown namespace.
    }

    return XMLNS_UNKNOWN_ID;
}

void xmlns_repository::add_predefined_values(const xmlns_id_t* predefined_ns)
{
    if (!predefined_ns)
        return;

    const xmlns_id_t* val = &predefined_ns[0];
    for (; *val; ++val)
    {
        std::string_view s(*val);
        mp_impl->m_strid_map.insert(
            strid_map_type::value_type(s, mp_impl->m_identifiers.size()));
        mp_impl->m_identifiers.push_back(s);

        ++mp_impl->m_predefined_ns_size;

#if ORCUS_DEBUG_XML_NAMESPACE
        cout << "xlmns_repository: predefined ns='" << s << "'" << endl;
#endif
    }
}

/**
 * Create a namespace context that is bound to this repository.
 *
 * @return new context constructed from a reference to this repository.
 */
xmlns_context xmlns_repository::create_context()
{
    return xmlns_context{*this};
}

/**
 * Look up a namespace identifier by its numerical index.
 *
 * @param index position in registration order.
 *
 * @return identifier stored at that position, or XMLNS_UNKNOWN_ID when the
 *         index is out of range.
 */
xmlns_id_t xmlns_repository::get_identifier(size_t index) const
{
    const std::vector<std::string_view>& ids = mp_impl->m_identifiers;

    // The stored views are handed out as plain character pointers.  The
    // original author's note states that all identifier strings are interned
    // and therefore null-terminated.
    return index < ids.size() ? ids[index].data() : XMLNS_UNKNOWN_ID;
}

string xmlns_repository::get_short_name(xmlns_id_t ns_id) const
{
    size_t index = get_index(ns_id);

    if (index == INDEX_NOT_FOUND)
        return string("???");

    std::ostringstream os;
    os << "ns" << index;
    return os.str();
}

/**
 * Look up the numerical index of a namespace identifier.
 *
 * The lookup is done on the string the identifier points to, not on the
 * pointer value.
 *
 * @param ns_id namespace identifier; may be null.
 *
 * @return index of the identifier, or INDEX_NOT_FOUND when @p ns_id is null
 *         or not registered.
 */
size_t xmlns_repository::get_index(xmlns_id_t ns_id) const
{
    if (ns_id)
    {
        const strid_map_type& lookup = mp_impl->m_strid_map;
        const auto pos = lookup.find(std::string_view(ns_id));
        if (pos != lookup.end())
            return pos->second;
    }

    return INDEX_NOT_FOUND;
}

typedef std::vector<xmlns_id_t> xmlns_list_type;
typedef std::unordered_map<std::string_view, xmlns_list_type> alias_map_type;

struct xmlns_context::impl
{
    xmlns_repository* repo = nullptr;
    xmlns_list_type m_all_ns; /// all namespaces ever used in this context.
    xmlns_list_type m_default;
    alias_map_type m_map;

    bool m_trim_all_ns = true;

    impl() {}
    impl(xmlns_repository& _repo) : repo(&_repo) {}
    impl(const impl& r) :
        repo(r.repo), m_all_ns(r.m_all_ns), m_default(r.m_default), m_map(r.m_map), m_trim_all_ns(r.m_trim_all_ns) {}
};

/// Create a context that is not associated with any repository.
xmlns_context::xmlns_context() :
    mp_impl(std::make_unique<impl>())
{
}

/// Create a context bound to the given repository.
xmlns_context::xmlns_context(xmlns_repository& repo) :
    mp_impl(std::make_unique<impl>(repo))
{
}

/// Create a context whose state is a copy of the state of @p r.
xmlns_context::xmlns_context(const xmlns_context& r) :
    mp_impl(std::make_unique<impl>(*r.mp_impl))
{
}

/// Take over the state of @p r.  The moved-from context is given a fresh,
/// empty state with no repository so that it never holds a null state
/// pointer.
xmlns_context::xmlns_context(xmlns_context&& r) :
    mp_impl(std::move(r.mp_impl))
{
    r.mp_impl = std::make_unique<impl>();
}

xmlns_context::~xmlns_context() = default;

/// Copy-assign by building a copy of @p r and swapping it into place.
xmlns_context& xmlns_context::operator= (const xmlns_context& r)
{
    // The temporary leaves with the previous state and releases it.
    xmlns_context(r).swap(*this);
    return *this;
}

/// Move-assign by moving @p r into a temporary and swapping it into place.
/// As with the move constructor, @p r is left holding a fresh, empty state.
xmlns_context& xmlns_context::operator= (xmlns_context&& r)
{
    // The temporary leaves with the previous state and releases it.
    xmlns_context(std::move(r)).swap(*this);
    return *this;
}

/**
 * Bind a namespace URI to an alias by pushing it onto that alias's stack.
 *
 * The URI is interned in the associated repository first.  An empty alias
 * addresses the default namespace stack.  Every pushed identifier is also
 * recorded in the list of all namespaces used by this context.
 *
 * NOTE(review): a non-empty alias seen for the first time is stored as a
 * map key without being copied, so the caller presumably has to keep the
 * alias string alive - confirm.
 *
 * @param alias namespace alias; empty for the default namespace.
 * @param uri namespace URI to bind.
 *
 * @return the interned identifier that was pushed.  This is a null pointer
 *         when the repository could not intern the URI.
 *
 * @throws general_error when no repository is associated with this context,
 *         or when inserting a new alias into the map fails.
 */
xmlns_id_t xmlns_context::push(std::string_view alias, std::string_view uri)
{
    impl& cxt = *mp_impl;

    if (!cxt.repo)
        throw general_error("this context is not associated with any repo.");

#if ORCUS_DEBUG_XML_NAMESPACE
    std::cout << "xmlns_context::push: key='" << alias << "', uri='" << uri << "'" << std::endl;
#endif
    // The list of all namespaces needs normalizing again after this push.
    cxt.m_trim_all_ns = true;

    // When interning fails the identifier is null, and that null value is
    // what gets pushed.
    const xmlns_id_t ns = cxt.repo->intern(uri);
    const xmlns_id_t pushed = ns ? ns : nullptr;

    if (alias.empty())
    {
        // An empty alias refers to the default namespace.
        cxt.m_default.push_back(pushed);
        cxt.m_all_ns.push_back(pushed);
        return pushed;
    }

    const auto pos = cxt.m_map.find(alias);
    if (pos != cxt.m_map.end())
    {
        // The alias is already known; grow its stack.
        pos->second.push_back(pushed);
        cxt.m_all_ns.push_back(pushed);
        return pushed;
    }

    // First use of this alias: start a new stack holding the one namespace.
    xmlns_list_type stack;
    stack.push_back(pushed);
    cxt.m_all_ns.push_back(pushed);

    const bool inserted = cxt.m_map.insert({alias, stack}).second;
    if (!inserted)
        throw general_error("Failed to insert new namespace.");

    return pushed;
}

void xmlns_context::pop(std::string_view alias)
{
#if ORCUS_DEBUG_XML_NAMESPACE
    cout << "xmlns_context::pop: alias='" << alias << "'" << endl;
#endif
    if (alias.empty())
    {
        // empty alias value is associated with default namespace.
        if (mp_impl->m_default.empty())
            throw general_error("default namespace stack is empty.");

        mp_impl->m_default.pop_back();
        return;
    }

    // See if this alias really exists.
    alias_map_type::iterator it = mp_impl->m_map.find(alias);
    if (it == mp_impl->m_map.end())
    {
        std::ostringstream os;
        os << "alias named '" << alias << "' was attempted to be popped, but was not found in the stack";
        throw general_error(os.str());
    }

    xmlns_list_type& nslist = it->second;
    if (nslist.empty())
        throw general_error("namespace stack for this key is empty.");

    nslist.pop_back();
}

/**
 * Return the namespace currently bound to an alias.
 *
 * @param alias namespace alias; empty for the default namespace.
 *
 * @return identifier on top of the alias's stack, or XMLNS_UNKNOWN_ID when
 *         the alias is unknown or its stack is empty.
 */
xmlns_id_t xmlns_context::get(std::string_view alias) const
{
#if ORCUS_DEBUG_XML_NAMESPACE
    std::cout << "xmlns_context::get: alias='" << alias << "', default ns stack size="
        << mp_impl->m_default.size() << ", non-default alias count=" << mp_impl->m_map.size();
    std::cout << ", ";
    print_map_keys(mp_impl->m_map);
    std::cout << std::endl;
#endif
    if (alias.empty())
    {
        // An empty alias refers to the default namespace.
        const xmlns_list_type& defaults = mp_impl->m_default;
        if (defaults.empty())
            return XMLNS_UNKNOWN_ID;

        return defaults.back();
    }

    const alias_map_type& aliases = mp_impl->m_map;
    const auto pos = aliases.find(alias);
    if (pos == aliases.end())
    {
#if ORCUS_DEBUG_XML_NAMESPACE
        std::cout << "xmlns_context::get: alias not in this context" << std::endl;
#endif
        return XMLNS_UNKNOWN_ID;
    }

    const xmlns_list_type& stack = pos->second;
#if ORCUS_DEBUG_XML_NAMESPACE
    std::cout << "xmlns_context::get: alias stack size=" << stack.size() << std::endl;
#endif
    if (stack.empty())
        return XMLNS_UNKNOWN_ID;

    return stack.back();
}

/**
 * Look up the numerical index of a namespace identifier in the associated
 * repository.
 *
 * @param ns_id namespace identifier to look up.
 *
 * @return whatever the repository's get_index() returns for @p ns_id.
 *
 * @throws general_error when no repository is associated with this context.
 */
size_t xmlns_context::get_index(xmlns_id_t ns_id) const
{
    const xmlns_repository* repository = mp_impl->repo;
    if (repository)
        return repository->get_index(ns_id);

    throw general_error("this context is not associated with any repo.");
}

string xmlns_context::get_short_name(xmlns_id_t ns_id) const
{
    if (!mp_impl->repo)
        throw general_error("this context is not associated with any repo.");

    return mp_impl->repo->get_short_name(ns_id);
}

/**
 * Find an alias that is currently bound to the given namespace.
 *
 * Only the top of each non-default alias stack is examined, and identifiers
 * are compared by pointer value.  When several aliases match, which one is
 * returned depends on the iteration order of the underlying unordered map.
 *
 * @param ns_id namespace identifier to search for.
 *
 * @return matching alias, or an empty view when none is bound to @p ns_id.
 */
std::string_view xmlns_context::get_alias(xmlns_id_t ns_id) const
{
    for (const auto& [alias, stack] : mp_impl->m_map)
    {
        if (!stack.empty() && stack.back() == ns_id)
            return alias;
    }

    return std::string_view{};
}

namespace {

#if ORCUS_DEBUG_XML_NAMESPACE
/**
 * Debug functor that prints the address and the text of a namespace
 * identifier to standard output, one identifier per line.
 */
struct print_ns
{
    void operator() (xmlns_id_t ns_id) const
    {
        const char* p = ns_id;

        // %p requires a pointer to void, and %s must not be given a null
        // pointer; the list being printed can contain null identifiers.
        std::printf("%p: %s\n", static_cast<const void*>(p), p ? p : "(null)");
    }
};
#endif

/**
 * Pairs a namespace identifier with its numerical index so that a list of
 * namespaces can be ordered by index.
 */
struct ns_item
{
    size_t index;   ///< numerical index of the namespace.
    xmlns_id_t ns;  ///< the namespace identifier itself.

    ns_item(size_t item_index, xmlns_id_t item_ns) :
        index(item_index),
        ns(item_ns)
    {
    }
};

struct less_ns_by_index
{
    bool operator() (const ns_item& left, const ns_item& right) const
    {
        return left.index < right.index;
    }
};

class push_back_ns_to_item
{
    vector<ns_item>& m_store;
    const xmlns_context& m_cxt;
public:
    push_back_ns_to_item(vector<ns_item>& store, const xmlns_context& cxt) : m_store(store), m_cxt(cxt) {}
    void operator() (xmlns_id_t ns)
    {
        size_t num_id = m_cxt.get_index(ns);
        if (num_id != INDEX_NOT_FOUND)
            m_store.push_back(ns_item(num_id, ns));
    }
};

class push_back_item_to_ns
{
    std::vector<xmlns_id_t>& m_store;
public:
    push_back_item_to_ns(std::vector<xmlns_id_t>& store) : m_store(store) {}
    void operator() (const ns_item& item)
    {
        m_store.push_back(item.ns);
    }
};

}

std::vector<xmlns_id_t> xmlns_context::get_all_namespaces() const
{
#if ORCUS_DEBUG_XML_NAMESPACE
    cout << "xmlns_context::get_all_namespaces: count=" << mp_impl->m_all_ns.size() << endl;
    std::for_each(mp_impl->m_all_ns.begin(), mp_impl->m_all_ns.end(), print_ns());
#endif

    std::vector<xmlns_id_t> nslist;

    if (mp_impl->m_trim_all_ns)
    {
        xmlns_list_type& all_ns = mp_impl->m_all_ns;

        nslist.assign(mp_impl->m_all_ns.begin(), mp_impl->m_all_ns.end());

        // Sort it and remove duplicate.
        std::sort(all_ns.begin(), all_ns.end());
        xmlns_list_type::iterator it_unique_end =
            std::unique(all_ns.begin(), all_ns.end());
        all_ns.erase(it_unique_end, all_ns.end());

        // Now, sort by indices.
        vector<ns_item> items;
        std::for_each(all_ns.begin(), all_ns.end(), push_back_ns_to_item(items, *this));
        std::sort(items.begin(), items.end(), less_ns_by_index());

        all_ns.clear();
        std::for_each(items.begin(), items.end(), push_back_item_to_ns(all_ns));

        mp_impl->m_trim_all_ns = false;
    }

    nslist.assign(mp_impl->m_all_ns.begin(), mp_impl->m_all_ns.end());
    return nslist;
}

void xmlns_context::dump(std::ostream& os) const
{
    vector<xmlns_id_t> nslist = get_all_namespaces();
    vector<xmlns_id_t>::const_iterator it = nslist.begin(), it_end = nslist.end();
    for (; it != it_end; ++it)
    {
        xmlns_id_t ns_id = *it;
        size_t num_id = get_index(ns_id);
        if (num_id == INDEX_NOT_FOUND)
            continue;

        os << "ns" << num_id << "=\"" << ns_id << '"' << endl;
    }
}

/**
 * Write the state of this context to a stream in two sections.
 *
 * The "namespaces:" section lists every namespace used in this context with
 * its numerical index.  The "aliases:" section lists every non-default
 * alias followed by the contents of its namespace stack, oldest first.
 *
 * @param os output stream to write to.
 *
 * @throws general_error from get_index() when no repository is associated
 *         with this context and the namespace list is not empty.
 */
void xmlns_context::dump_state(std::ostream& os) const
{
    os << "namespaces:" << std::endl;
    for (xmlns_id_t ns_id : get_all_namespaces())
    {
        size_t num_id = get_index(ns_id);
        if (num_id == INDEX_NOT_FOUND)
            continue;

        os << "  ns" << num_id << ": \"" << ns_id << '"' << std::endl;
    }

    os << "aliases:" << std::endl;
    for (const auto& [alias, ns_list] : mp_impl->m_map)
    {
        os << "  " << alias << ":" << std::endl;

        for (const xmlns_id_t ns : ns_list)
        {
            // push() stores a null identifier when the repository could not
            // intern the URI.  Streaming a null character pointer is
            // undefined behaviour, so print a placeholder instead.
            os << "    - " << (ns ? ns : "(unknown)") << std::endl;
        }
    }
}

/**
 * Exchange the entire state of this context with that of another one.
 *
 * @param other context to swap state with.
 */
void xmlns_context::swap(xmlns_context& other) noexcept
{
    other.mp_impl.swap(mp_impl);
}

}

/* vim:set shiftwidth=4 softtabstop=4 expandtab: */