summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/common/uniquecharstr.h
blob: 10cc924f7f9eb2f6e6c80c97e3d3ccd1153b8c6f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
// © 2020 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html

// uniquecharstr.h
// created: 2020sep01 Frank Yung-Fong Tang

#ifndef __UNIQUECHARSTR_H__
#define __UNIQUECHARSTR_H__

#include "charstr.h"
#include "uassert.h"
#include "uhash.h"

U_NAMESPACE_BEGIN

/**
 * Stores NUL-terminated strings with duplicate elimination.
 * Checks for unique UTF-16 string pointers and converts to invariant characters.
 *
 * Intended to be stack-allocated. Add strings, get a unique number for each,
 * freeze the object, get a char * pointer for each string,
 * call orphanCharStrings() to capture the string storage, and let this object go out of scope.
 */
class UniqueCharStrings {
public:
    UniqueCharStrings(UErrorCode &errorCode) : strings(nullptr) {
        // Note: We hash on string contents but store stable char16_t * pointers.
        // If the strings are stored in resource bundles which should be built with
        // duplicate elimination, then we should be able to hash on just the pointer values.
        uhash_init(&map, uhash_hashUChars, uhash_compareUChars, uhash_compareLong, &errorCode);
        if (U_FAILURE(errorCode)) { return; }
        strings = new CharString();
        if (strings == nullptr) {
            errorCode = U_MEMORY_ALLOCATION_ERROR;
        }
    }
    ~UniqueCharStrings() {
        uhash_close(&map);
        delete strings;
    }

    /** Returns/orphans the CharString that contains all strings. */
    CharString *orphanCharStrings() {
        CharString *result = strings;
        strings = nullptr;
        return result;
    }

    /**
     * Adds a string and returns a unique number for it.
     * The string's buffer contents must not change, nor move around in memory,
     * while this UniqueCharStrings is in use.
     * The string contents must be NUL-terminated exactly at s.length().
     *
     * Best used with read-only-alias UnicodeString objects that point to
     * stable storage, such as strings returned by resource bundle functions.
     */
    int32_t add(const UnicodeString &s, UErrorCode &errorCode) {
        if (U_FAILURE(errorCode)) { return 0; }
        if (isFrozen) {
            errorCode = U_NO_WRITE_PERMISSION;
            return 0;
        }
        // The string points into the resource bundle.
        const char16_t *p = s.getBuffer();
        int32_t oldIndex = uhash_geti(&map, p);
        if (oldIndex != 0) {  // found duplicate
            return oldIndex;
        }
        // Explicit NUL terminator for the previous string.
        // The strings object is also terminated with one implicit NUL.
        strings->append(0, errorCode);
        int32_t newIndex = strings->length();
        strings->appendInvariantChars(s, errorCode);
        uhash_puti(&map, const_cast<char16_t *>(p), newIndex, &errorCode);
        return newIndex;
    }

    void freeze() { isFrozen = true; }

    /**
     * Returns a string pointer for its unique number, if this object is frozen.
     * Otherwise nullptr.
     */
    const char *get(int32_t i) const {
        U_ASSERT(isFrozen);
        return isFrozen && i > 0 ? strings->data() + i : nullptr;
    }

private:
    UHashtable map;
    CharString *strings;
    bool isFrozen = false;
};

U_NAMESPACE_END

#endif  // __UNIQUECHARSTR_H__