summaryrefslogtreecommitdiffstats
path: root/lib/search/hex.c
blob: 50af6fb605c4dd6e0c151197f008dc2247c9d390 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
/*
   Search text engine.
   HEX-style pattern matching

   Copyright (C) 2009-2023
   Free Software Foundation, Inc.

   Written by:
   Slava Zanko <slavazanko@gmail.com>, 2009.

   This file is part of the Midnight Commander.

   The Midnight Commander is free software: you can redistribute it
   and/or modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation, either version 3 of the License,
   or (at your option) any later version.

   The Midnight Commander is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <config.h>

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "lib/global.h"
#include "lib/strutil.h"
#include "lib/search.h"
#include "lib/strescape.h"

#include "internal.h"

/*** global variables ****************************************************************************/

/*** file scope macro definitions ****************************************************************/

/* Result of translating a hex search pattern into a regex. */
typedef enum
{
    /* The whole pattern was translated. */
    MC_SEARCH_HEX_E_OK = 0,
    /* A number in the pattern is larger than a byte (0xFF). */
    MC_SEARCH_HEX_E_NUM_OUT_OF_RANGE = 1,
    /* A token is neither whitespace, a hex number nor a quoted string. */
    MC_SEARCH_HEX_E_INVALID_CHARACTER = 2,
    /* A quoted string has no closing quote. */
    MC_SEARCH_HEX_E_UNMATCHED_QUOTES = 3
} mc_search_hex_parse_error_t;

/*** file scope type declarations ****************************************************************/

/*** forward declarations (file scope functions) *************************************************/

/*** file scope variables ************************************************************************/

/* --------------------------------------------------------------------------------------------- */
/*** file scope functions ************************************************************************/
/* --------------------------------------------------------------------------------------------- */

static GString *
mc_search__hex_translate_to_regex (const GString * astr, mc_search_hex_parse_error_t * error_ptr,
                                   int *error_pos_ptr)
{
    GString *buff;
    const char *str;
    gsize str_len;
    gsize loop = 0;
    mc_search_hex_parse_error_t error = MC_SEARCH_HEX_E_OK;

    buff = g_string_sized_new (64);
    str = astr->str;
    str_len = astr->len;

    while (loop < str_len && error == MC_SEARCH_HEX_E_OK)
    {
        unsigned int val;
        int ptr;

        if (g_ascii_isspace (str[loop]))
        {
            /* Eat-up whitespace between tokens. */
            while (g_ascii_isspace (str[loop]))
                loop++;
        }
        /* cppcheck-suppress invalidscanf */
        else if (sscanf (str + loop, "%x%n", &val, &ptr) == 1)
        {
            if (val > 255)
                error = MC_SEARCH_HEX_E_NUM_OUT_OF_RANGE;
            else
            {
                g_string_append_printf (buff, "\\x%02X", val);
                loop += ptr;
            }
        }
        else if (str[loop] == '"')
        {
            gsize loop2;

            loop2 = loop + 1;

            while (loop2 < str_len)
            {
                if (str[loop2] == '"')
                    break;
                if (str[loop2] == '\\' && loop2 + 1 < str_len)
                    loop2++;
                g_string_append_c (buff, str[loop2]);
                loop2++;
            }

            if (str[loop2] == '\0')
                error = MC_SEARCH_HEX_E_UNMATCHED_QUOTES;
            else
                loop = loop2 + 1;
        }
        else
            error = MC_SEARCH_HEX_E_INVALID_CHARACTER;
    }

    if (error != MC_SEARCH_HEX_E_OK)
    {
        g_string_free (buff, TRUE);
        if (error_ptr != NULL)
            *error_ptr = error;
        if (error_pos_ptr != NULL)
            *error_pos_ptr = loop;
        return NULL;
    }

    return buff;
}

/* --------------------------------------------------------------------------------------------- */
/*** public functions ****************************************************************************/
/* --------------------------------------------------------------------------------------------- */

/**
 * Prepare a search condition for a hex-style pattern.
 *
 * The hex pattern stored in the condition is translated into a regex. On success
 * the condition's pattern string is replaced by the translated one and the regex
 * initializer finishes the job. On failure the search object gets the
 * MC_SEARCH_E_INPUT error and a newly allocated message that includes the
 * 1-based position of the problem.
 *
 * @param charset        charset of the condition; UTF-8 is replaced by "ASCII" (see below)
 * @param lc_mc_search   search object; receives the error state on failure
 * @param mc_search_cond condition whose pattern string is translated
 */
void
mc_search__cond_struct_new_init_hex (const char *charset, mc_search_t * lc_mc_search,
                                     mc_search_cond_t * mc_search_cond)
{
    mc_search_hex_parse_error_t parse_error = MC_SEARCH_HEX_E_OK;
    int parse_error_pos = 0;
    const char *reason = "";
    GString *regex_text;

    /*
     * Hex search usually runs over binary data, and binary data is often not
     * valid UTF-8.
     *
     * For such data the regex must be a non UTF-8 one (that is, G_REGEX_RAW).
     * Otherwise both GLib's PCRE and our mc_search__g_regex_match_full_safe()
     * fail us: the former doesn't find all bytes, the latter overwrites the
     * supposedly invalid UTF-8 with NULs.
     *
     * We get a raw regex by passing "ASCII" as the charset.
     *
     * Any charset other than "UTF-8" would trigger G_REGEX_RAW (see [1]), and
     * the output of [2] would be the same for all of them because [2] skips
     * the \xXX symbols produced by mc_search__hex_translate_to_regex().
     *
     * "ASCII" is nevertheless the best choice, because a hex pattern may
     * contain a quoted string: with "ASCII" we know [2] ignores every
     * character outside the range of ASCII letters. Ignored characters are
     * copied verbatim to the output and match as-is, i.e. case-sensitively.
     * (A user who needs case-insensitive search of international text
     * shouldn't be using hex search in the first place.)
     *
     * Leaving UTF-8 has one more benefit:
     *
     * In case-insensitive searches GLib treats \xXX symbols as normal letters,
     * so the hex pattern "0x61" would match both "a" and "A". Outside UTF-8 we
     * use [2] instead, which doesn't have this issue.
     *
     * [1] mc_search__cond_struct_new_init_regex
     * [2] mc_search__cond_struct_new_regex_ci_str
     */
    if (str_isutf8 (charset))
        charset = "ASCII";

    regex_text =
        mc_search__hex_translate_to_regex (mc_search_cond->str, &parse_error, &parse_error_pos);

    if (regex_text != NULL)
    {
        /* The translated pattern takes the place of the original hex pattern. */
        g_string_free (mc_search_cond->str, TRUE);
        mc_search_cond->str = regex_text;
        mc_search__cond_struct_new_init_regex (charset, lc_mc_search, mc_search_cond);
        return;
    }

    /* Translation failed: pick the human-readable reason (empty for unknown codes). */
    if (parse_error == MC_SEARCH_HEX_E_NUM_OUT_OF_RANGE)
        reason =
            _("Number out of range (should be in byte range, 0 <= n <= 0xFF, expressed in hex)");
    else if (parse_error == MC_SEARCH_HEX_E_INVALID_CHARACTER)
        reason = _("Invalid character");
    else if (parse_error == MC_SEARCH_HEX_E_UNMATCHED_QUOTES)
        reason = _("Unmatched quotes character");

    lc_mc_search->error = MC_SEARCH_E_INPUT;
    /* The parser reports a 0-based offset; the message shows it 1-based. */
    lc_mc_search->error_str =
        g_strdup_printf (_("Hex pattern error at position %d:\n%s."), parse_error_pos + 1, reason);
}

/* --------------------------------------------------------------------------------------------- */

/**
 * Run a hex search.
 *
 * A hex pattern is turned into a regex when the condition is initialized, so
 * the search itself is handed over unchanged to the regex engine.
 *
 * @return the value returned by mc_search__run_regex() for the same arguments
 */
gboolean
mc_search__run_hex (mc_search_t * lc_mc_search, const void *user_data,
                    gsize start_search, gsize end_search, gsize * found_len)
{
    gboolean found;

    found = mc_search__run_regex (lc_mc_search, user_data, start_search, end_search, found_len);

    return found;
}

/* --------------------------------------------------------------------------------------------- */

/**
 * Prepare the replacement string for a hex search.
 *
 * No hex-specific processing is applied to the replacement text.
 *
 * @param lc_mc_search search object (not used)
 * @param replace_str  replacement text
 *
 * @return the result of mc_g_string_dup() for the replacement text
 *         (presumably an independent copy -- verify against mc_g_string_dup)
 */
GString *
mc_search_hex_prepare_replace_str (mc_search_t * lc_mc_search, GString * replace_str)
{
    GString *copy;

    (void) lc_mc_search;

    copy = mc_g_string_dup (replace_str);

    return copy;
}

/* --------------------------------------------------------------------------------------------- */