summaryrefslogtreecommitdiffstats
path: root/src/regexp/syntax/make_perl_groups.pl
blob: 80a2c9ae6b9af5c57d8872dbe5448af1c13ce2dc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/usr/bin/perl
# Copyright 2008 The Go Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

# Modified version of RE2's make_perl_groups.pl.

# Generate table entries giving character ranges
# for POSIX/Perl character classes.  Rather than
# figure out what the definition is, it is easier to ask
# Perl about each letter from 0-128 and write down
# its answer.

@posixclasses = (
	"[:alnum:]",
	"[:alpha:]",
	"[:ascii:]",
	"[:blank:]",
	"[:cntrl:]",
	"[:digit:]",
	"[:graph:]",
	"[:lower:]",
	"[:print:]",
	"[:punct:]",
	"[:space:]",
	"[:upper:]",
	"[:word:]",
	"[:xdigit:]",
);

@perlclasses = (
	"\\d",
	"\\s",
	"\\w",
);

%overrides = (
	# Prior to Perl 5.18, \s did not match vertical tab.
	# RE2 preserves that original behaviour.
	"\\s:11" => 0,
);

sub ComputeClass($) {
  my @ranges;
  my ($class) = @_;
  my $regexp = "[$class]";
  my $start = -1;
  for (my $i=0; $i<=129; $i++) {
    if ($i == 129) { $i = 256; }
    if ($i <= 128 && ($overrides{"$class:$i"} // chr($i) =~ $regexp)) {
      if ($start < 0) {
        $start = $i;
      }
    } else {
      if ($start >= 0) {
        push @ranges, [$start, $i-1];
      }
      $start = -1;
    }
  }
  return @ranges;
}

sub PrintClass($$@) {
  my ($cname, $name, @ranges) = @_;
  print "var code$cname = []rune{  /* $name */\n";
  for (my $i=0; $i<@ranges; $i++) {
    my @a = @{$ranges[$i]};
    printf "\t0x%x, 0x%x,\n", $a[0], $a[1];
  }
  print "}\n\n";
  my $n = @ranges;
  $negname = $name;
  if ($negname =~ /:/) {
    $negname =~ s/:/:^/;
  } else {
    $negname =~ y/a-z/A-Z/;
  }
  return "\t`$name`: {+1, code$cname},\n" .
  	"\t`$negname`: {-1, code$cname},\n";
}

my $gen = 0;

sub PrintClasses($@) {
  my ($cname, @classes) = @_;
  my @entries;
  foreach my $cl (@classes) {
    my @ranges = ComputeClass($cl);
    push @entries, PrintClass(++$gen, $cl, @ranges);
  }
  print "var ${cname}Group = map[string]charGroup{\n";
  foreach my $e (@entries) {
    print $e;
  }
  print "}\n";
  my $count = @entries;
}

print <<EOF;
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
// make_perl_groups.pl >perl_groups.go

package syntax

EOF

PrintClasses("perl", @perlclasses);
PrintClasses("posix", @posixclasses);