summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/common/locdistance.h
blob: 51b777e6272312ff75f99595bb7ebbbace8c7c5b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html

// locdistance.h
// created: 2019may08 Markus W. Scherer

#ifndef __LOCDISTANCE_H__
#define __LOCDISTANCE_H__

#include "unicode/utypes.h"
#include "unicode/bytestrie.h"
#include "unicode/localematcher.h"
#include "unicode/locid.h"
#include "unicode/uobject.h"
#include "lsr.h"

U_NAMESPACE_BEGIN

struct LocaleDistanceData;

/**
 * Offline-built data for LocaleMatcher.
 * Mostly but not only the data for mapping locales to their maximized forms.
 */
class LocaleDistance final : public UMemory {
public:
    static const LocaleDistance *getSingleton(UErrorCode &errorCode);

    static int32_t shiftDistance(int32_t distance) {
        return distance << DISTANCE_SHIFT;
    }

    static int32_t getShiftedDistance(int32_t indexAndDistance) {
        return indexAndDistance & DISTANCE_MASK;
    }

    static double getDistanceDouble(int32_t indexAndDistance) {
        double shiftedDistance = getShiftedDistance(indexAndDistance);
        return shiftedDistance / (1 << DISTANCE_SHIFT);
    }

    static int32_t getDistanceFloor(int32_t indexAndDistance) {
        return (indexAndDistance & DISTANCE_MASK) >> DISTANCE_SHIFT;
    }

    static int32_t getIndex(int32_t indexAndDistance) {
        // assert indexAndDistance >= 0;
        return indexAndDistance >> INDEX_SHIFT;
    }

    /**
     * Finds the supported LSR with the smallest distance from the desired one.
     * Equivalent LSR subtags must be normalized into a canonical form.
     *
     * <p>Returns the index of the lowest-distance supported LSR in the high bits
     * (negative if none has a distance below the threshold),
     * and its distance (0..ABOVE_THRESHOLD) in the low bits.
     */
    int32_t getBestIndexAndDistance(const LSR &desired,
                                    const LSR **supportedLSRs, int32_t supportedLSRsLength,
                                    int32_t shiftedThreshold,
                                    ULocMatchFavorSubtag favorSubtag,
                                    ULocMatchDirection direction) const;

    UBool isParadigmLSR(const LSR &lsr) const;

    int32_t getDefaultScriptDistance() const {
        return defaultScriptDistance;
    }

    int32_t getDefaultDemotionPerDesiredLocale() const {
        return defaultDemotionPerDesiredLocale;
    }

private:
    // The distance is shifted left to gain some fraction bits.
    static constexpr int32_t DISTANCE_SHIFT = 3;
    static constexpr int32_t DISTANCE_FRACTION_MASK = 7;
    // 7 bits for 0..100
    static constexpr int32_t DISTANCE_INT_SHIFT = 7;
    static constexpr int32_t INDEX_SHIFT = DISTANCE_INT_SHIFT + DISTANCE_SHIFT;
    static constexpr int32_t DISTANCE_MASK = 0x3ff;
    // tic constexpr int32_t MAX_INDEX = 0x1fffff;  // avoids sign bit
    static constexpr int32_t INDEX_NEG_1 = 0xfffffc00;

    LocaleDistance(const LocaleDistanceData &data, const XLikelySubtags &likely);
    LocaleDistance(const LocaleDistance &other) = delete;
    LocaleDistance &operator=(const LocaleDistance &other) = delete;

    static void initLocaleDistance(UErrorCode &errorCode);

    UBool isMatch(const LSR &desired, const LSR &supported,
                  int32_t shiftedThreshold, ULocMatchFavorSubtag favorSubtag) const {
        const LSR *pSupp = &supported;
        return getBestIndexAndDistance(
            desired, &pSupp, 1,
            shiftedThreshold, favorSubtag, ULOCMATCH_DIRECTION_WITH_ONE_WAY) >= 0;
    }

    static int32_t getDesSuppScriptDistance(BytesTrie &iter, uint64_t startState,
                                            const char *desired, const char *supported);

    static int32_t getRegionPartitionsDistance(
        BytesTrie &iter, uint64_t startState,
        const char *desiredPartitions, const char *supportedPartitions,
        int32_t threshold);

    static int32_t getFallbackRegionDistance(BytesTrie &iter, uint64_t startState);

    static int32_t trieNext(BytesTrie &iter, const char *s, bool wantValue);

    const char *partitionsForRegion(const LSR &lsr) const {
        // ill-formed region -> one non-matching string
        int32_t pIndex = regionToPartitionsIndex[lsr.regionIndex];
        return partitionArrays[pIndex];
    }

    int32_t getDefaultRegionDistance() const {
        return defaultRegionDistance;
    }

    const XLikelySubtags &likelySubtags;

    // The trie maps each dlang+slang+dscript+sscript+dregion+sregion
    // (encoded in ASCII with bit 7 set on the last character of each subtag) to a distance.
    // There is also a trie value for each subsequence of whole subtags.
    // One '*' is used for a (desired, supported) pair of "und", "Zzzz"/"", or "ZZ"/"".
    BytesTrie trie;

    /**
     * Maps each region to zero or more single-character partitions.
     */
    const uint8_t *regionToPartitionsIndex;
    const char **partitionArrays;

    /**
     * Used to get the paradigm region for a cluster, if there is one.
     */
    const LSR *paradigmLSRs;
    int32_t paradigmLSRsLength;

    int32_t defaultLanguageDistance;
    int32_t defaultScriptDistance;
    int32_t defaultRegionDistance;
    int32_t minRegionDistance;
    int32_t defaultDemotionPerDesiredLocale;
};

U_NAMESPACE_END

#endif  // __LOCDISTANCE_H__