summaryrefslogtreecommitdiffstats
path: root/src/tools/gen_keywordlist.pl
blob: e9250b8fb2e2f746e9d8d5535e8afaa063e62335 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
#----------------------------------------------------------------------
#
# gen_keywordlist.pl
#	Perl script that transforms a list of keywords into a ScanKeywordList
#	data structure that can be passed to ScanKeywordLookup().
#
# The input is a C header file containing a series of macro calls
#	PG_KEYWORD("keyword", ...)
# Lines not starting with PG_KEYWORD are ignored.  The keywords are
# implicitly numbered 0..N-1 in order of appearance in the header file.
# Currently, the keywords are required to appear in ASCII order.
#
# The output is a C header file that defines a "const ScanKeywordList"
# variable named according to the -v switch ("ScanKeywords" by default).
# The variable is marked "static" unless the -e switch is given.
#
# ScanKeywordList uses hash-based lookup, so this script also selects
# a minimal perfect hash function for the keyword set, and emits a
# static hash function that is referenced in the ScanKeywordList struct.
# The hash function is case-insensitive unless --no-case-fold is specified.
# Note that case folding works correctly only for all-ASCII keywords!
#
#
# Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
# Portions Copyright (c) 1994, Regents of the University of California
#
# src/tools/gen_keywordlist.pl
#
#----------------------------------------------------------------------

use strict;
use warnings;
use Getopt::Long;

use FindBin;
use lib $FindBin::RealBin;

use PerfectHash;

my $output_path = '';
my $extern      = 0;
my $case_fold   = 1;
my $varname     = 'ScanKeywords';

GetOptions(
	'output:s'   => \$output_path,
	'extern'     => \$extern,
	'case-fold!' => \$case_fold,
	'varname:s'  => \$varname) || usage();

my $kw_input_file = shift @ARGV || die "No input file.\n";

# Make sure output_path ends in a slash if needed.
if ($output_path ne '' && substr($output_path, -1) ne '/')
{
	$output_path .= '/';
}

$kw_input_file =~ /(\w+)\.h$/
  || die "Input file must be named something.h.\n";
my $base_filename = $1 . '_d';
my $kw_def_file   = $output_path . $base_filename . '.h';

open(my $kif,   '<', $kw_input_file) || die "$kw_input_file: $!\n";
open(my $kwdef, '>', $kw_def_file)   || die "$kw_def_file: $!\n";

# Opening boilerplate for keyword definition header.
printf $kwdef <<EOM, $base_filename, uc $base_filename, uc $base_filename;
/*-------------------------------------------------------------------------
 *
 * %s.h
 *    List of keywords represented as a ScanKeywordList.
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * NOTES
 *  ******************************
 *  *** DO NOT EDIT THIS FILE! ***
 *  ******************************
 *
 *  It has been GENERATED by src/tools/gen_keywordlist.pl
 *
 *-------------------------------------------------------------------------
 */

#ifndef %s_H
#define %s_H

#include "common/kwlookup.h"

EOM

# Parse input file for keyword names.
my @keywords;
while (<$kif>)
{
	if (/^PG_KEYWORD\("(\w+)"/)
	{
		push @keywords, $1;
	}
}

# When being case-insensitive, insist that the input be all-lower-case.
if ($case_fold)
{
	foreach my $kw (@keywords)
	{
		die qq|The keyword "$kw" is not lower-case in $kw_input_file\n|
		  if ($kw ne lc $kw);
	}
}

# Error out if the keyword names are not in ASCII order.
#
# While this isn't really necessary with hash-based lookup, it's still
# helpful because it provides a cheap way to reject duplicate keywords.
# Also, insisting on sorted order ensures that code that scans the keyword
# table linearly will see the keywords in a canonical order.
for my $i (0 .. $#keywords - 1)
{
	die
	  qq|The keyword "$keywords[$i + 1]" is out of order in $kw_input_file\n|
	  if ($keywords[$i] cmp $keywords[ $i + 1 ]) >= 0;
}

# Emit the string containing all the keywords.

printf $kwdef qq|static const char %s_kw_string[] =\n\t"|, $varname;
print $kwdef join qq|\\0"\n\t"|, @keywords;
print $kwdef qq|";\n\n|;

# Emit an array of numerical offsets which will be used to index into the
# keyword string.  Also determine max keyword length.

printf $kwdef "static const uint16 %s_kw_offsets[] = {\n", $varname;

my $offset  = 0;
my $max_len = 0;
foreach my $name (@keywords)
{
	my $this_length = length($name);

	print $kwdef "\t$offset,\n";

	# Calculate the cumulative offset of the next keyword,
	# taking into account the null terminator.
	$offset += $this_length + 1;

	# Update max keyword length.
	$max_len = $this_length if $max_len < $this_length;
}

print $kwdef "};\n\n";

# Emit a macro defining the number of keywords.
# (In some places it's useful to have access to that as a constant.)

printf $kwdef "#define %s_NUM_KEYWORDS %d\n\n", uc $varname, scalar @keywords;

# Emit the definition of the hash function.

my $funcname = $varname . "_hash_func";

my $f = PerfectHash::generate_hash_function(\@keywords, $funcname,
	case_fold => $case_fold);

printf $kwdef qq|static %s\n|, $f;

# Emit the struct that wraps all this lookup info into one variable.

printf $kwdef "static " if !$extern;
printf $kwdef "const ScanKeywordList %s = {\n", $varname;
printf $kwdef qq|\t%s_kw_string,\n|,            $varname;
printf $kwdef qq|\t%s_kw_offsets,\n|,           $varname;
printf $kwdef qq|\t%s,\n|,                      $funcname;
printf $kwdef qq|\t%s_NUM_KEYWORDS,\n|,         uc $varname;
printf $kwdef qq|\t%d\n|,                       $max_len;
printf $kwdef "};\n\n";

printf $kwdef "#endif\t\t\t\t\t\t\t/* %s_H */\n", uc $base_filename;


sub usage
{
	die <<EOM;
Usage: gen_keywordlist.pl [--output/-o <path>] [--varname/-v <varname>] [--extern/-e] [--[no-]case-fold] input_file
    --output        Output directory (default '.')
    --varname       Name for ScanKeywordList variable (default 'ScanKeywords')
    --extern        Allow the ScanKeywordList variable to be globally visible
    --no-case-fold  Keyword matching is to be case-sensitive

gen_keywordlist.pl transforms a list of keywords into a ScanKeywordList.
The output filename is derived from the input file by inserting _d,
for example kwlist_d.h is produced from kwlist.h.
EOM
}