summaryrefslogtreecommitdiffstats
path: root/third_party/rust/rure/examples/iter.c
blob: 47c83e806f8bcc99a57b719123a831a29ae4df40 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
/*
 * This example code shows how to iterate over all regex matches in a file,
 * emit the match location and print the contents of a capturing group.
 */

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include "rure.h"

/*
 * Entry point: maps sherlock.txt into memory, compiles a regex, then prints
 * the first capture group and the full-match offsets for every match.
 *
 * Returns 0 on success. Exits with status 1 (after writing a diagnostic to
 * stderr) if the file cannot be opened, stat'ed or mapped, if it is empty or
 * too large to address, or if the regex fails to compile.
 */
int main(void) {
    /* Open a file and mmap it. */
    int fd = open("sherlock.txt", O_RDONLY);
    if (fd == -1) {
        perror("failed to open sherlock.txt");
        exit(1);
    }
    struct stat status;
    if (fstat(fd, &status) == -1) {
        perror("failed to stat sherlock.txt");
        exit(1);
    }
    /*
     * The next two checks are not failed library calls, so errno carries no
     * relevant information; report them with fprintf rather than perror to
     * avoid appending an unrelated (or "Success") errno string.
     */
    if ((uintmax_t)status.st_size > SIZE_MAX) {
        fprintf(stderr, "file too big\n");
        exit(1);
    }
    if (status.st_size == 0) {
        fprintf(stderr, "file empty\n");
        exit(1);
    }
    size_t sherlock_len = (size_t)status.st_size;
    const uint8_t *sherlock = (const uint8_t *)mmap(
        NULL, sherlock_len, PROT_READ, MAP_PRIVATE, fd, 0);
    /* The descriptor is no longer needed once mmap has been attempted. */
    close(fd);
    if (sherlock == MAP_FAILED) {
        perror("could not mmap file");
        exit(1);
    }

    /*
     * Compile the regular expression. A more convenient routine,
     * rure_compile_must, is also available, which will print an error
     * message to stderr and abort the process if the regex compilation
     * fails. We show the full gory details here as an example.
     */
    const char *pattern = "(\\w+)\\s+Holmes";
    size_t pattern_len = strlen(pattern);
    rure_error *err = rure_error_new();
    rure *re = rure_compile((const uint8_t *)pattern, pattern_len,
                            RURE_FLAG_UNICODE | RURE_FLAG_CASEI, NULL, err);
    if (NULL == re) {
        /* A null regex means compilation failed and an error exists. */
        fprintf(stderr, "compilation of %s failed: %s\n",
                pattern, rure_error_message(err));
        rure_error_free(err);
        munmap((void *)sherlock, sherlock_len);
        exit(1);
    }
    rure_error_free(err);

    /*
     * Create an iterator to find all successive non-overlapping matches.
     * For each match, we extract the location of the capturing group.
     */
    rure_match group0 = {0};
    rure_match group1 = {0};
    rure_captures *caps = rure_captures_new(re);
    rure_iter *it = rure_iter_new(re);

    while (rure_iter_next_captures(it, sherlock, sherlock_len, caps)) {
        /*
         * Get the location of the full match and the capturing group.
         * We know that both accesses are successful since the body of the
         * loop only executes if there is a match and both capture groups
         * must match in order for the entire regex to match.
         *
         * N.B. The zeroth group corresponds to the full match of the regex.
         */
        rure_captures_at(caps, 0, &group0);
        rure_captures_at(caps, 1, &group1);
        /*
         * The mapped file is not NUL-terminated, so print the group with an
         * explicit precision equal to its byte length.
         */
        printf("%.*s (match at: %zu, %zu)\n",
               (int)(group1.end - group1.start),
               sherlock + group1.start,
               group0.start, group0.end);
    }

    /* Free all our resources. */
    munmap((void *)sherlock, sherlock_len);
    rure_captures_free(caps);
    rure_iter_free(it);
    rure_free(re);
    return 0;
}