// Copyright (c) 2001-2010 Hartmut Kaiser
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
// This example shows how to create a simple lexer recognizing a couple of
// different tokens for a simple language, and how to use this lexer with
// a grammar. It shows how to associate attributes with tokens and how to
// access the token attributes from inside the grammar.
//
// Additionally, this example demonstrates how to define a token set usable
// as the skip parser during parsing, allowing several tokens (such as
// whitespace and comments) to be ignored.
//
// The main purpose of this example is to show how inheritance can be used
// to override parts of a base grammar and to add token definitions to a
// base lexer.
//
// Further, it shows how you can use the 'omit' attribute type specifier
// for token definitions to force a token to have no attribute (i.e. to
// expose an unused attribute type).
//
// This example recognizes a very simple programming language consisting of
// assignment statements and 'if' and 'while' control structures. Look at
// the file example5.input for a sample, and at the illustrative snippet
// below.
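//
// A hypothetical input accepted by this language might look like the
// following (the actual contents of example5.input are not reproduced
// here):
//
//     {
//         a = 5;
//         if (a) { b = a; }
//         while (b) { b = 0; }
//     }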
#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <iostream>
#include <fstream>
#include <string>
#include "example.hpp"
using namespace boost::spirit;
using boost::phoenix::val;
///////////////////////////////////////////////////////////////////////////////
// Token definition base, defines all tokens for the base grammar below
///////////////////////////////////////////////////////////////////////////////
template <typename Lexer>
struct example5_base_tokens : lex::lexer<Lexer>
{
protected:
// this lexer is supposed to be used as a base type only
example5_base_tokens() {}
public:
void init_token_definitions()
{
// define the tokens to match
identifier = "[a-zA-Z_][a-zA-Z0-9_]*";
constant = "[0-9]+";
if_ = "if";
while_ = "while";
// associate the tokens and the token set with the lexer
this->self += lex::token_def<>('(') | ')' | '{' | '}' | '=' | ';' | constant;
this->self += if_ | while_ | identifier;
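// Note: when two token definitions match input of the same length, the
// lexertl lexer prefers the definition added first. Adding if_ and while_
// before identifier therefore ensures the keywords are not swallowed by
// the more general identifier pattern.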
// define the whitespace to ignore (spaces, tabs, newlines and C-style
// comments)
this->self("WS")
= lex::token_def<>("[ \\t\\n]+")
| "\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/"
;
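// The second regular expression above matches C-style comments: a "/*",
// followed by anything that does not close the comment, up to the
// terminating "*/".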
}
// these tokens have no attribute
lex::token_def<lex::omit> if_, while_;
// The following two tokens have an associated attribute type: 'identifier'
// carries a string (the identifier name) and 'constant' carries the
// matched unsigned integer value.
//
// Note: any token attribute type explicitly specified in a token_def<>
// declaration needs to be listed during token type definition as
// well (see the typedef for the token_type below).
//
// The conversion of the matched input to an instance of this type occurs
// only once (on first access), which makes token attributes as efficient as
// possible. Moreover, token instances are constructed only once by the
// lexer library. From then on, tokens are passed by reference only,
// avoiding unnecessary copies.
lex::token_def<std::string> identifier;
lex::token_def<unsigned int> constant;
};
///////////////////////////////////////////////////////////////////////////////
// Grammar definition base, defines a basic language
///////////////////////////////////////////////////////////////////////////////
template <typename Iterator, typename Lexer>
struct example5_base_grammar
: qi::grammar<Iterator, qi::in_state_skipper<Lexer> >
{
template <typename TokenDef>
example5_base_grammar(TokenDef const& tok)
: example5_base_grammar::base_type(program)
{
using boost::spirit::_val;
program
= +block
;
block
= '{' >> *statement >> '}'
;
statement
= assignment
| if_stmt
| while_stmt
;
assignment
= (tok.identifier >> '=' >> expression >> ';')
[
std::cout << val("assignment statement to: ") << _1 << "\n"
]
;
if_stmt
= (tok.if_ >> '(' >> expression >> ')' >> block)
[
std::cout << val("if expression: ") << _1 << "\n"
]
;
while_stmt
= (tok.while_ >> '(' >> expression >> ')' >> block)
[
std::cout << val("while expression: ") << _1 << "\n"
]
;
// since expression has a variant return type accommodating both
// std::string and unsigned int, either of the two possible values may be
// returned to the calling rule
expression
= tok.identifier [ _val = _1 ]
| tok.constant [ _val = _1 ]
;
}
typedef qi::in_state_skipper<Lexer> skipper_type;
qi::rule<Iterator, skipper_type> program, block, statement;
qi::rule<Iterator, skipper_type> assignment, if_stmt;
qi::rule<Iterator, skipper_type> while_stmt;
// the expression is the only rule having a return value
typedef boost::variant<unsigned int, std::string> expression_type;
qi::rule<Iterator, expression_type(), skipper_type> expression;
};
///////////////////////////////////////////////////////////////////////////////
// Token definition for derived lexer, defines additional tokens
///////////////////////////////////////////////////////////////////////////////
template <typename Lexer>
struct example5_tokens : example5_base_tokens<Lexer>
{
typedef example5_base_tokens<Lexer> base_type;
example5_tokens()
{
// define the additional token to match
else_ = "else";
// associate the new token with the lexer; note that we add 'else' before
// anything else so it precedes the identifier token in the token set,
// otherwise "else" would be matched as an identifier
this->self = else_;
// now add the token definitions from the base class
this->base_type::init_token_definitions();
}
// this token has no attribute
lex::token_def<lex::omit> else_;
};
///////////////////////////////////////////////////////////////////////////////
// Derived grammar definition, defines a language extension
///////////////////////////////////////////////////////////////////////////////
template <typename Iterator, typename Lexer>
struct example5_grammar : example5_base_grammar<Iterator, Lexer>
{
template <typename TokenDef>
example5_grammar(TokenDef const& tok)
: example5_base_grammar<Iterator, Lexer>(tok)
{
// we extend the inherited if_stmt only, adding an optional else branch
this->if_stmt
= this->if_stmt.copy() >> -(tok.else_ >> this->block)
;
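// Note: the call to copy() above is essential. It makes a deep copy of
// if_stmt's current right hand side, so the new definition extends the
// inherited one. Writing 'this->if_stmt >> ...' without copy() would embed
// a reference to if_stmt inside its own definition, producing an
// infinitely recursive rule.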
}
};
///////////////////////////////////////////////////////////////////////////////
int main()
{
// iterator type used to expose the underlying input stream
typedef std::string::iterator base_iterator_type;
// This is the lexer token type to use. The second template parameter lists
// all attribute types used for token_def's during token definition (see
// example5_base_tokens<> above). Here we use the predefined lexertl token
// type, but any compatible token type may be used instead.
//
// If you don't list any token attribute types in the following declaration
// (or just use the default token type: lex::lexertl::token<base_iterator_type>,
// see the sketch below), it will compile and work just fine, just a bit less
// efficiently. This is because the token attribute will be generated from the
// matched input sequence every time it is requested. But as soon as you
// specify at least one token attribute type, you'll have to list all
// attribute types used for token_def<> declarations in the token definition
// class above, otherwise compilation errors will occur.
typedef lex::lexertl::token<
base_iterator_type, boost::mpl::vector<unsigned int, std::string>
> token_type;
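// For comparison, the simpler but less efficient default mentioned above
// would be declared as follows (shown as a sketch only, not used here):
//
//     typedef lex::lexertl::token<base_iterator_type> token_type;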
// Here we use the lexertl based lexer engine.
typedef lex::lexertl::lexer<token_type> lexer_type;
// This is the token definition type (derived from the given lexer type).
typedef example5_tokens<lexer_type> example5_tokens;
// this is the iterator type exposed by the lexer
typedef example5_tokens::iterator_type iterator_type;
// this is the type of the grammar to parse
typedef example5_grammar<iterator_type, example5_tokens::lexer_def> example5_grammar;
// now we use the types defined above to create the lexer and grammar
// object instances needed to invoke the parsing process
example5_tokens tokens; // Our lexer
example5_grammar calc(tokens); // Our parser
std::string str (read_from_file("example5.input"));
// At this point we generate the iterator pair used to expose the
// tokenized input stream.
std::string::iterator it = str.begin();
iterator_type iter = tokens.begin(it, str.end());
iterator_type end = tokens.end();
// Parsing is done based on the token stream, not the character
// stream read from the input.
// Note how we use the lexer itself as the skip parser. It must be explicitly
// wrapped inside an in_state directive, which switches the lexer to the "WS"
// state (where the whitespace and comment tokens are defined) for the
// duration of skipping.
std::string ws("WS");
bool r = qi::phrase_parse(iter, end, calc, qi::in_state(ws)[tokens.self]);
if (r && iter == end)
{
std::cout << "-------------------------\n";
std::cout << "Parsing succeeded\n";
std::cout << "-------------------------\n";
}
else
{
std::cout << "-------------------------\n";
std::cout << "Parsing failed\n";
std::cout << "-------------------------\n";
}
std::cout << "Bye... :-) \n\n";
return 0;
}