1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
|
#ifndef _RURE_H
#define _RURE_H
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
* rure is the type of a compiled regular expression.
*
* An rure can be safely used from multiple threads simultaneously.
*/
typedef struct rure rure;
/*
* rure_set is the type of a set of compiled regular expressions.
*
* A rure can be safely used from multiple threads simultaneously.
*/
typedef struct rure_set rure_set;
/*
* rure_options is the set of non-flag configuration options for compiling
* a regular expression. Currently, only two options are available: setting
* the size limit of the compiled program and setting the size limit of the
* cache of states that the DFA uses while searching.
*
* For most uses, the default settings will work fine, and NULL can be passed
* wherever a *rure_options is expected.
*/
typedef struct rure_options rure_options;
/*
* The flags listed below can be used in rure_compile to set the default
* flags. All flags can otherwise be toggled in the expression itself using
* standard syntax, e.g., `(?i)` turns case insensitive matching on and `(?-i)`
* disables it.
*/
/* The case insensitive (i) flag. */
#define RURE_FLAG_CASEI (1 << 0)
/* The multi-line matching (m) flag. (^ and $ match new line boundaries.) */
#define RURE_FLAG_MULTI (1 << 1)
/* The any character (s) flag. (. matches new line.) */
#define RURE_FLAG_DOTNL (1 << 2)
/* The greedy swap (U) flag. (e.g., + is ungreedy and +? is greedy.) */
#define RURE_FLAG_SWAP_GREED (1 << 3)
/* The ignore whitespace (x) flag. */
#define RURE_FLAG_SPACE (1 << 4)
/* The Unicode (u) flag. */
#define RURE_FLAG_UNICODE (1 << 5)
/* The default set of flags enabled when no flags are set. */
#define RURE_DEFAULT_FLAGS RURE_FLAG_UNICODE
/*
* rure_match corresponds to the location of a single match in a haystack.
*/
typedef struct rure_match {
/* The start position. */
size_t start;
/* The end position. */
size_t end;
} rure_match;
/*
* rure_captures represents storage for sub-capture locations of a match.
*
* Computing the capture groups of a match can carry a significant performance
* penalty, so their use in the API is optional.
*
* An rure_captures value can be reused in multiple calls to rure_find_captures,
* so long as it is used with the compiled regular expression that created
* it.
*
* An rure_captures value may outlive its corresponding rure and can be freed
* independently.
*
* It is not safe to use from multiple threads simultaneously.
*/
typedef struct rure_captures rure_captures;
/*
* rure_iter is an iterator over successive non-overlapping matches in a
* particular haystack.
*
* An rure_iter value may not outlive its corresponding rure and should be freed
* before its corresponding rure is freed.
*
* It is not safe to use from multiple threads simultaneously.
*/
typedef struct rure_iter rure_iter;
/*
* rure_iter_capture_names is an iterator over the list of capture group names
* in this particular rure.
*
* An rure_iter_capture_names value may not outlive its corresponding rure,
* and should be freed before its corresponding rure is freed.
*
* It is not safe to use from multiple threads simultaneously.
*/
typedef struct rure_iter_capture_names rure_iter_capture_names;
/*
* rure_error is an error that caused compilation to fail.
*
* Most errors are syntax errors but an error can be returned if the compiled
* regular expression would be too big.
*
* Whenever a function accepts an *rure_error, it is safe to pass NULL. (But
* you will not get access to the error if one occurred.)
*
* It is not safe to use from multiple threads simultaneously.
*/
typedef struct rure_error rure_error;
/*
* rure_compile_must compiles the given pattern into a regular expression. If
* compilation fails for any reason, an error message is printed to stderr and
* the process is aborted.
*
* The pattern given should be in UTF-8. For convenience, this accepts a C
* string, which means the pattern cannot usefully contain NUL. If your pattern
* may contain NUL, consider using a regular expression escape sequence, or
* just use rure_compile.
*
* This uses RURE_DEFAULT_FLAGS.
*
* The compiled expression returned may be used from multiple threads
* simultaneously.
*/
rure *rure_compile_must(const char *pattern);
/*
* rure_compile compiles the given pattern into a regular expression. The
* pattern must be valid UTF-8 and the length corresponds to the number of
* bytes in the pattern.
*
* flags is a bitfield. Valid values are constants declared with prefix
* RURE_FLAG_.
*
* options contains non-flag configuration settings. If it's NULL, default
* settings are used. options may be freed immediately after a call to
* rure_compile.
*
* error is set if there was a problem compiling the pattern (including if the
* pattern is not valid UTF-8). If error is NULL, then no error information
* is returned. In all cases, if an error occurs, NULL is returned.
*
* The compiled expression returned may be used from multiple threads
* simultaneously.
*/
rure *rure_compile(const uint8_t *pattern, size_t length,
uint32_t flags, rure_options *options,
rure_error *error);
/*
* rure_free frees the given compiled regular expression.
*
* This must be called at most once for any rure.
*/
void rure_free(rure *re);
/*
* rure_is_match returns true if and only if re matches anywhere in haystack.
*
* haystack may contain arbitrary bytes, but ASCII compatible text is more
* useful. UTF-8 is even more useful. Other text encodings aren't supported.
* length should be the number of bytes in haystack.
*
* start is the position at which to start searching. Note that setting the
* start position is distinct from incrementing the pointer, since the regex
* engine may look at bytes before the start position to determine match
* information. For example, if the start position is greater than 0, then the
* \A ("begin text") anchor can never match.
*
* rure_is_match should be preferred to rure_find since it may be faster.
*
* N.B. The performance of this search is not impacted by the presence of
* capturing groups in your regular expression.
*/
bool rure_is_match(rure *re, const uint8_t *haystack, size_t length,
size_t start);
/*
* rure_find returns true if and only if re matches anywhere in haystack.
* If a match is found, then its start and end offsets (in bytes) are set
* on the match pointer given.
*
* haystack may contain arbitrary bytes, but ASCII compatible text is more
* useful. UTF-8 is even more useful. Other text encodings aren't supported.
* length should be the number of bytes in haystack.
*
* start is the position at which to start searching. Note that setting the
* start position is distinct from incrementing the pointer, since the regex
* engine may look at bytes before the start position to determine match
* information. For example, if the start position is greater than 0, then the
* \A ("begin text") anchor can never match.
*
* rure_find should be preferred to rure_find_captures since it may be faster.
*
* N.B. The performance of this search is not impacted by the presence of
* capturing groups in your regular expression.
*/
bool rure_find(rure *re, const uint8_t *haystack, size_t length,
size_t start, rure_match *match);
/*
* rure_find_captures returns true if and only if re matches anywhere in
* haystack. If a match is found, then all of its capture locations are stored
* in the captures pointer given.
*
* haystack may contain arbitrary bytes, but ASCII compatible text is more
* useful. UTF-8 is even more useful. Other text encodings aren't supported.
* length should be the number of bytes in haystack.
*
* start is the position at which to start searching. Note that setting the
* start position is distinct from incrementing the pointer, since the regex
* engine may look at bytes before the start position to determine match
* information. For example, if the start position is greater than 0, then the
* \A ("begin text") anchor can never match.
*
* Only use this function if you specifically need access to capture locations.
* It is not necessary to use this function just because your regular
* expression contains capturing groups.
*
* Capture locations can be accessed using the rure_captures_* functions.
*
* N.B. The performance of this search can be impacted by the number of
* capturing groups. If you're using this function, it may be beneficial to
* use non-capturing groups (e.g., `(?:re)`) where possible.
*/
bool rure_find_captures(rure *re, const uint8_t *haystack, size_t length,
size_t start, rure_captures *captures);
/*
* rure_shortest_match returns true if and only if re matches anywhere in
* haystack. If a match is found, then its end location is stored in the
* pointer given. The end location is the place at which the regex engine
* determined that a match exists, but may occur before the end of the proper
* leftmost-first match.
*
* haystack may contain arbitrary bytes, but ASCII compatible text is more
* useful. UTF-8 is even more useful. Other text encodings aren't supported.
* length should be the number of bytes in haystack.
*
* start is the position at which to start searching. Note that setting the
* start position is distinct from incrementing the pointer, since the regex
* engine may look at bytes before the start position to determine match
* information. For example, if the start position is greater than 0, then the
* \A ("begin text") anchor can never match.
*
* rure_shortest_match should be preferred to rure_find since it may be faster.
*
* N.B. The performance of this search is not impacted by the presence of
* capturing groups in your regular expression.
*/
bool rure_shortest_match(rure *re, const uint8_t *haystack, size_t length,
size_t start, size_t *end);
/*
* rure_capture_name_index returns the capture index for the name given. If
* no such named capturing group exists in re, then -1 is returned.
*
* The capture index may be used with rure_captures_at.
*
* This function never returns 0 since the first capture group always
* corresponds to the entire match and is always unnamed.
*/
int32_t rure_capture_name_index(rure *re, const char *name);
/*
* rure_iter_capture_names_new creates a new capture_names iterator.
*
* An iterator will report all successive capture group names of re.
*/
rure_iter_capture_names *rure_iter_capture_names_new(rure *re);
/*
* rure_iter_capture_names_free frees the iterator given.
*
* It must be called at most once.
*/
void rure_iter_capture_names_free(rure_iter_capture_names *it);
/*
* rure_iter_capture_names_next advances the iterator and returns true
* if and only if another capture group name exists.
*
* The value of the capture group name is written to the provided pointer.
*/
bool rure_iter_capture_names_next(rure_iter_capture_names *it, char **name);
/*
* rure_iter_new creates a new iterator.
*
* An iterator will report all successive non-overlapping matches of re.
* When calling iterator functions, the same haystack and length must be
* supplied to all invocations. (Strict pointer equality is, however, not
* required.)
*/
rure_iter *rure_iter_new(rure *re);
/*
* rure_iter_free frees the iterator given.
*
* It must be called at most once.
*/
void rure_iter_free(rure_iter *it);
/*
* rure_iter_next advances the iterator and returns true if and only if a
* match was found. If a match is found, then the match pointer is set with the
* start and end location of the match, in bytes.
*
* If no match is found, then subsequent calls will return false indefinitely.
*
* haystack may contain arbitrary bytes, but ASCII compatible text is more
* useful. UTF-8 is even more useful. Other text encodings aren't supported.
* length should be the number of bytes in haystack. The given haystack must
* be logically equivalent to all other haystacks given to this iterator.
*
* rure_iter_next should be preferred to rure_iter_next_captures since it may
* be faster.
*
* N.B. The performance of this search is not impacted by the presence of
* capturing groups in your regular expression.
*/
bool rure_iter_next(rure_iter *it, const uint8_t *haystack, size_t length,
rure_match *match);
/*
* rure_iter_next_captures advances the iterator and returns true if and only if a
* match was found. If a match is found, then all of its capture locations are
* stored in the captures pointer given.
*
* If no match is found, then subsequent calls will return false indefinitely.
*
* haystack may contain arbitrary bytes, but ASCII compatible text is more
* useful. UTF-8 is even more useful. Other text encodings aren't supported.
* length should be the number of bytes in haystack. The given haystack must
* be logically equivalent to all other haystacks given to this iterator.
*
* Only use this function if you specifically need access to capture locations.
* It is not necessary to use this function just because your regular
* expression contains capturing groups.
*
* Capture locations can be accessed using the rure_captures_* functions.
*
* N.B. The performance of this search can be impacted by the number of
* capturing groups. If you're using this function, it may be beneficial to
* use non-capturing groups (e.g., `(?:re)`) where possible.
*/
bool rure_iter_next_captures(rure_iter *it,
const uint8_t *haystack, size_t length,
rure_captures *captures);
/*
* rure_captures_new allocates storage for all capturing groups in re.
*
* An rure_captures value may be reused on subsequent calls to
* rure_find_captures or rure_iter_next_captures.
*
* An rure_captures value may be freed independently of re, although any
* particular rure_captures should be used only with the re given here.
*
* It is not safe to use an rure_captures value from multiple threads
* simultaneously.
*/
rure_captures *rure_captures_new(rure *re);
/*
* rure_captures_free frees the given captures.
*
* This must be called at most once.
*/
void rure_captures_free(rure_captures *captures);
/*
* rure_captures_at returns true if and only if the capturing group at the
* index given was part of a match. If so, the given match pointer is populated
* with the start and end location (in bytes) of the capturing group.
*
* If no capture group with the index i exists, then false is
* returned. (A capturing group exists if and only if i is less than
* rure_captures_len(captures).)
*
* Note that index 0 corresponds to the full match.
*/
bool rure_captures_at(rure_captures *captures, size_t i, rure_match *match);
/*
* rure_captures_len returns the number of capturing groups in the given
* captures.
*/
size_t rure_captures_len(rure_captures *captures);
/*
* rure_options_new allocates space for options.
*
* Options may be freed immediately after a call to rure_compile, but otherwise
* may be freely used in multiple calls to rure_compile.
*
* It is not safe to set options from multiple threads simultaneously. It is
* safe to call rure_compile from multiple threads simultaneously using the
* same options pointer.
*/
rure_options *rure_options_new();
/*
* rure_options_free frees the given options.
*
* This must be called at most once.
*/
void rure_options_free(rure_options *options);
/*
* rure_options_size_limit sets the appoximate size limit of the compiled
* regular expression.
*
* This size limit roughly corresponds to the number of bytes occupied by a
* single compiled program. If the program would exceed this number, then a
* compilation error will be returned from rure_compile.
*/
void rure_options_size_limit(rure_options *options, size_t limit);
/*
* rure_options_dfa_size_limit sets the approximate size of the cache used by
* the DFA during search.
*
* This roughly corresponds to the number of bytes that the DFA will use while
* searching.
*
* Note that this is a *per thread* limit. There is no way to set a global
* limit. In particular, if a regular expression is used from multiple threads
* simultaneously, then each thread may use up to the number of bytes
* specified here.
*/
void rure_options_dfa_size_limit(rure_options *options, size_t limit);
/*
* rure_compile_set compiles the given list of patterns into a single regular
* expression which can be matched in a linear-scan. Each pattern in patterns
* must be valid UTF-8 and the length of each pattern in patterns corresponds
* to a byte length in patterns_lengths.
*
* The number of patterns to compile is specified by patterns_count. patterns
* must contain at least this many entries.
*
* flags is a bitfield. Valid values are constants declared with prefix
* RURE_FLAG_.
*
* options contains non-flag configuration settings. If it's NULL, default
* settings are used. options may be freed immediately after a call to
* rure_compile.
*
* error is set if there was a problem compiling the pattern.
*
* The compiled expression set returned may be used from multiple threads.
*/
rure_set *rure_compile_set(const uint8_t **patterns,
const size_t *patterns_lengths,
size_t patterns_count,
uint32_t flags,
rure_options *options,
rure_error *error);
/*
* rure_set_free frees the given compiled regular expression set.
*
* This must be called at most once for any rure_set.
*/
void rure_set_free(rure_set *re);
/*
* rure_is_match returns true if and only if any regexes within the set
* match anywhere in the haystack. Once a match has been located, the
* matching engine will quit immediately.
*
* haystack may contain arbitrary bytes, but ASCII compatible text is more
* useful. UTF-8 is even more useful. Other text encodings aren't supported.
* length should be the number of bytes in haystack.
*
* start is the position at which to start searching. Note that setting the
* start position is distinct from incrementing the pointer, since the regex
* engine may look at bytes before the start position to determine match
* information. For example, if the start position is greater than 0, then the
* \A ("begin text") anchor can never match.
*/
bool rure_set_is_match(rure_set *re, const uint8_t *haystack, size_t length,
size_t start);
/*
* rure_set_matches compares each regex in the set against the haystack and
* modifies matches with the match result of each pattern. Match results are
* ordered in the same way as the rure_set was compiled. For example,
* index 0 of matches corresponds to the first pattern passed to
* `rure_compile_set`.
*
* haystack may contain arbitrary bytes, but ASCII compatible text is more
* useful. UTF-8 is even more useful. Other text encodings aren't supported.
* length should be the number of bytes in haystack.
*
* start is the position at which to start searching. Note that setting the
* start position is distinct from incrementing the pointer, since the regex
* engine may look at bytes before the start position to determine match
* information. For example, if the start position is greater than 0, then the
* \A ("begin text") anchor can never match.
*
* matches must be greater than or equal to the number of patterns the
* rure_set was compiled with.
*
* Only use this function if you specifically need to know which regexes
* matched within the set. To determine if any of the regexes matched without
* caring which, use rure_set_is_match.
*/
bool rure_set_matches(rure_set *re, const uint8_t *haystack, size_t length,
size_t start, bool *matches);
/*
* rure_set_len returns the number of patterns rure_set was compiled with.
*/
size_t rure_set_len(rure_set *re);
/*
* rure_error_new allocates space for an error.
*
* If error information is desired, then rure_error_new should be called
* to create an rure_error pointer, and that pointer can be passed to
* rure_compile. If an error occurred, then rure_compile will return NULL and
* the error pointer will be set. A message can then be extracted.
*
* It is not safe to use errors from multiple threads simultaneously. An error
* value may be reused on subsequent calls to rure_compile.
*/
rure_error *rure_error_new();
/*
* rure_error_free frees the error given.
*
* This must be called at most once.
*/
void rure_error_free(rure_error *err);
/*
* rure_error_message returns a NUL terminated string that describes the error
* message.
*
* The pointer returned must not be freed. Instead, it will be freed when
* rure_error_free is called. If err is used in subsequent calls to
* rure_compile, then this pointer may change or become invalid.
*/
const char *rure_error_message(rure_error *err);
/*
* rure_escape_must returns a NUL terminated string where all meta characters
* have been escaped. If escaping fails for any reason, an error message is
* printed to stderr and the process is aborted.
*
* The pattern given should be in UTF-8. For convenience, this accepts a C
* string, which means the pattern cannot contain a NUL byte. These correspond
* to the only two failure conditions of this function. That is, if the caller
* guarantees that the given pattern is valid UTF-8 and does not contain a
* NUL byte, then this is guaranteed to succeed (modulo out-of-memory errors).
*
* The pointer returned must not be freed directly. Instead, it should be freed
* by calling rure_cstring_free.
*/
const char *rure_escape_must(const char *pattern);
/*
* rure_cstring_free frees the string given.
*
* This must be called at most once per string.
*/
void rure_cstring_free(char *s);
#ifdef __cplusplus
}
#endif
#endif
|