summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/i18n/bocsu.cpp
blob: 861a76a0427536185d885f4e0fae044df2e422bf (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*   Copyright (C) 2001-2014, International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*   file name:  bocsu.cpp
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   Author: Markus W. Scherer
*
*   Modification history:
*   05/18/2001  weiv    Made into separate module
*/


#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "unicode/bytestream.h"
#include "unicode/utf16.h"
#include "bocsu.h"

/*
 * encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes,
 * preserving lexical order
 */
static uint8_t *
u_writeDiff(int32_t diff, uint8_t *p) {
    if(diff>=SLOPE_REACH_NEG_1) {
        if(diff<=SLOPE_REACH_POS_1) {
            *p++=(uint8_t)(SLOPE_MIDDLE+diff);
        } else if(diff<=SLOPE_REACH_POS_2) {
            *p++=(uint8_t)(SLOPE_START_POS_2+(diff/SLOPE_TAIL_COUNT));
            *p++=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT);
        } else if(diff<=SLOPE_REACH_POS_3) {
            p[2]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT);
            diff/=SLOPE_TAIL_COUNT;
            p[1]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT);
            *p=(uint8_t)(SLOPE_START_POS_3+(diff/SLOPE_TAIL_COUNT));
            p+=3;
        } else {
            p[3]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT);
            diff/=SLOPE_TAIL_COUNT;
            p[2]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT);
            diff/=SLOPE_TAIL_COUNT;
            p[1]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT);
            *p=SLOPE_MAX;
            p+=4;
        }
    } else {
        int32_t m;

        if(diff>=SLOPE_REACH_NEG_2) {
            NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m);
            *p++=(uint8_t)(SLOPE_START_NEG_2+diff);
            *p++=(uint8_t)(SLOPE_MIN+m);
        } else if(diff>=SLOPE_REACH_NEG_3) {
            NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m);
            p[2]=(uint8_t)(SLOPE_MIN+m);
            NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m);
            p[1]=(uint8_t)(SLOPE_MIN+m);
            *p=(uint8_t)(SLOPE_START_NEG_3+diff);
            p+=3;
        } else {
            NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m);
            p[3]=(uint8_t)(SLOPE_MIN+m);
            NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m);
            p[2]=(uint8_t)(SLOPE_MIN+m);
            NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m);
            p[1]=(uint8_t)(SLOPE_MIN+m);
            *p=SLOPE_MIN;
            p+=4;
        }
    }
    return p;
}

/*
 * Encode the code points of a string as
 * a sequence of byte-encoded differences (slope detection),
 * preserving lexical order.
 *
 * Optimize the difference-taking for runs of Unicode text within
 * small scripts:
 *
 * Most small scripts are allocated within aligned 128-blocks of Unicode
 * code points. Lexical order is preserved if "prev" is always moved
 * into the middle of such a block.
 *
 * Additionally, "prev" is moved from anywhere in the Unihan
 * area into the middle of that area.
 * Note that the identical-level run in a sort key is generated from
 * NFD text - there are never Hangul characters included.
 */
U_CFUNC UChar32
u_writeIdenticalLevelRun(UChar32 prev, const UChar *s, int32_t length, icu::ByteSink &sink) {
    char scratch[64];
    int32_t capacity;

    int32_t i=0;
    while(i<length) {
        char *buffer=sink.GetAppendBuffer(1, length*2, scratch, (int32_t)sizeof(scratch), &capacity);
        uint8_t *p;
        // We must have capacity>=SLOPE_MAX_BYTES in case u_writeDiff() writes that much,
        // but we do not want to force the sink.GetAppendBuffer() to allocate
        // for a large min_capacity because we might actually only write one byte.
        if(capacity<16) {
            buffer=scratch;
            capacity=(int32_t)sizeof(scratch);
        }
        p=reinterpret_cast<uint8_t *>(buffer);
        uint8_t *lastSafe=p+capacity-SLOPE_MAX_BYTES;
        while(i<length && p<=lastSafe) {
            if(prev<0x4e00 || prev>=0xa000) {
                prev=(prev&~0x7f)-SLOPE_REACH_NEG_1;
            } else {
                /*
                 * Unihan U+4e00..U+9fa5:
                 * double-bytes down from the upper end
                 */
                prev=0x9fff-SLOPE_REACH_POS_2;
            }

            UChar32 c;
            U16_NEXT(s, i, length, c);
            if(c==0xfffe) {
                *p++=2;  // merge separator
                prev=0;
            } else {
                p=u_writeDiff(c-prev, p);
                prev=c;
            }
        }
        sink.Append(buffer, (int32_t)(p-reinterpret_cast<uint8_t *>(buffer)));
    }
    return prev;
}

#endif /* #if !UCONFIG_NO_COLLATION */