1
/*
 * \brief  Tokenizer support
 * \author Norman Feske
 * \date   2006-05-19
 */
6
7
/*
 * Copyright (C) 2006-2013 Genode Labs GmbH
 *
 * This file is part of the Genode OS framework, which is distributed
 * under the terms of the GNU General Public License version 2.
 */
13
14
#
ifndef _INCLUDE__UTIL__TOKEN_H_
15
#
define _INCLUDE__UTIL__TOKEN_H_
16
17
#
include <util/string.h>
18
19
/*
 * Forward declarations of the types defined by this header
 */
namespace Genode {

	/* scanner policy that admits underline characters in identifiers */
	struct Scanner_policy_identifier_with_underline;

	/* tokenizer, parametrized with a scanner policy */
	template <typename> class Token;
}
24
25
26
/**
27
* Scanner policy that accepts underline characters in identifiers
28
*/
29
struct
Genode::
Scanner_policy_identifier_with_underline
30
{
31
/**
32
* Return true if character belongs to a valid identifier
33
*
34
* \param c character
35
* \param i index of character in token
36
* \return true if character is a valid identifier character
37
*
38
* Letters and underline characters are allowed anywhere in an
39
* identifier, digits must not appear at the beginning.
40
*/
41
static
bool
identifier_char(
char
c
,
unsigned
i
)
{
42
return
is_letter(
c)
||
(
c ==
`_`
)
||
(
i &&
is_digit(
c)
)
;
}
43
}
;
44
45
46
/**
47
* Token
48
*
49
* This class is used to group characters of a string which belong
50
* to one syntactical token types number, identifier, string,
51
* whitespace or another single character.
52
*
53
* \param SCANNER_POLICY policy that defines the way of token scanning
54
*
55
* See `Scanner_policy_identifier_with_underline` for an example scanner
56
* policy.
57
*/
58
template
<
typename
SCANNER_POLICY
>
59
class
Genode::
Token
60
{
61
public
:
62
63
enum
Type {
SINGLECHAR
,
NUMBER
,
IDENT
,
STRING
,
WHITESPACE
,
END
}
;
64
65
/**
66
* Constructor
67
*
68
* \param s start of string to construct a token from
69
* \param max_len maximum token length
70
*
71
* The `max_len` argument is useful for processing character arrays
72
* that are not null-terminated.
73
*/
74
Token(
const
char *
s
=
0
,
size_t
max_len
=
~
0UL
)
75
:
_start(
s)
,
_max_len(
max_len)
,
_len(
s ?
_calc_len(
max_len)
: 0)
{
}
76
77
/**
78
* Accessors
79
*/
80
char *
start(
)
const
{
return
(
char *
)
_start
;
}
81
size_t
len(
)
const
{
return
_len
;
}
82
Type
type(
)
const
{
return
_type(
_len)
;
}
83
84
/**
85
* Return token as null-terminated string
86
*/
87
void
string(
char *
dst
,
size_t
max_len
)
const
{
88
strncpy(
dst,
start(
)
,
min(
len(
)
+
1,
max_len)
)
;
}
89
90
/**
91
* Return true if token is valid
92
*/
93
operator
bool
(
)
const
{
return
_start &&
_len
;
}
94
95
/**
96
* Access single characters of token
97
*/
98
char
operator
[]
(
int
idx
)
99
{
100
return
(
(
idx >=
0)
&&
(
(
unsigned
)
idx <
_len)
)
?
_start[idx]
: 0
;
101
}
102
103
/**
104
* Return next token
105
*/
106
Token
next(
)
const
{
return
Token(
_start +
_len,
_max_len -
_len)
;
}
107
108
/**
109
* Return next non-whitespace token
110
*/
111
Token
eat_whitespace(
)
const
{
return
(
_type(
_len)
==
WHITESPACE)
?
next(
)
: *
this
;
}
112
113
private
:
114
115
const
char *
_start;
116
size_t _max_len;
117
size_t _len;
118
119
/**
120
* Return type of token
121
*
122
* \param max_len maximum token length
123
*
124
* This method is used during the construction of `Token`
125
* objects, in particular for determining the value of the `_len`
126
* member. Therefore, we explicitely pass the `max_len` to the
127
* method. For the public interface, there exists the `type()`
128
* accessor, which relies on `_len` as implicit argument.
129
*/
130
Type
_type(
size_t
max_len
)
const
131
{
132
if
(
!
_start ||
max_len <
1 ||
!
*
_start)
return
END
;
133
134
/* determine the type based on the first character */
135
char c =
*
_start;
136
if
(
SCANNER_POLICY::
identifier_char(
c,
0)
)
return
IDENT
;
137
if
(
is_digit(
c)
)
return
NUMBER
;
138
if
(
is_whitespace(
c)
)
return
WHITESPACE
;
139
140
/* if string is incomplete, discard it (type END) */
141
if
(
c ==
`"`
)
142
return
_quoted_string_len(
max_len)
?
STRING : END
;
143
144
return
SINGLECHAR
;
145
}
146
147
size_t
_quoted_string_len(
size_t
max_len
)
const
148
{
149
unsigned
i =
0;
150
151
for
(
; !
end_of_quote(
&
_start[i])
&&
i <
max_len; i++
)
152
153
/* string ends without final quotation mark? too bad! */
154
if
(
!
_start[i])
return
0
;
155
156
/* exceeded maximum token length */
157
if
(
i ==
max_len)
return
0
;
158
159
/*
160
* We stopped our search at the character before the
161
* final quotation mark but we return the number of
162
* characters including the quotation marks.
163
*/
164
return
i +
2
;
165
}
166
167
/**
168
* Return length of token
169
*/
170
int
_calc_len(
size_t
max_len
)
const
171
{
172
switch
(
_type(
max_len)
)
{
173
174
case
SINGLECHAR:
175
return
1
;
176
177
case
NUMBER:
178
{
179
unsigned
i =
0;
180
for
(
; i <
max_len &&
is_digit(
_start[i])
; i++
)
;
181
return
i
;
182
}
183
184
case
IDENT:
185
{
186
unsigned
i =
0;
187
for
(
; i <
max_len; i++
)
{
188
if
(
SCANNER_POLICY::
identifier_char(
_start[i],
i)
)
189
continue
;
190
191
/* stop if any other (invalid) character occurs */
192
break;
193
}
194
return
i
;
195
}
196
197
case
STRING:
198
199
return
_quoted_string_len(
max_len)
;
200
201
case
WHITESPACE:
202
{
203
unsigned
i =
0;
204
for
(
; is_whitespace(
_start[i])
&&
i <
max_len; i++
)
;
205
return
i
;
206
}
207
208
case
END:
209
default
:
210
return
0
;
211
}
212
}
213
}
;
214
215
#
endif /* _INCLUDE__UTIL__TOKEN_H_ */