1 /*
2 * \brief Tokenizer support
3 * \author Norman Feske
4 * \date 2006-05-19
5 */
6
7 /*
8 * Copyright (C) 2006-2013 Genode Labs GmbH
9 *
10 * This file is part of the Genode OS framework, which is distributed
11 * under the terms of the GNU General Public License version 2.
12 */
13
14 #ifndef _INCLUDE__UTIL__TOKEN_H_
15 #define _INCLUDE__UTIL__TOKEN_H_
16
17 #include <util/string.h>
18
namespace Genode {

	/* scanner policy that admits underline characters in identifiers */
	struct Scanner_policy_identifier_with_underline;

	/* token type, parameterized with a scanner policy */
	template <typename SCANNER_POLICY> class Token;
}
24
25
26 /**
27 * Scanner policy that accepts underline characters in identifiers
28 */
29 struct Genode::Scanner_policy_identifier_with_underline
30 {
31 /**
32 * Return true if character belongs to a valid identifier
33 *
34 * \param c character
35 * \param i index of character in token
36 * \return true if character is a valid identifier character
37 *
38 * Letters and underline characters are allowed anywhere in an
39 * identifier, digits must not appear at the beginning.
40 */
41 static bool identifier_char(char c, unsigned i) {
42 return is_letter(c) || (c == `_`) || (i && is_digit(c)); }
43 };
44
45
46 /**
47 * Token
48 *
 * This class is used to group the characters of a string that belong
 * to one syntactical token. The supported token types are number,
 * identifier, string, whitespace, and any other single character.
52 *
53 * \param SCANNER_POLICY policy that defines the way of token scanning
54 *
55 * See `Scanner_policy_identifier_with_underline` for an example scanner
56 * policy.
57 */
58 template <typename SCANNER_POLICY>
59 class Genode::Token
60 {
61 public:
62
63 enum Type { SINGLECHAR, NUMBER, IDENT, STRING, WHITESPACE, END };
64
65 /**
66 * Constructor
67 *
68 * \param s start of string to construct a token from
69 * \param max_len maximum token length
70 *
71 * The `max_len` argument is useful for processing character arrays
72 * that are not null-terminated.
73 */
74 Token(const char *s = 0, size_t max_len = ~0UL)
75 : _start(s), _max_len(max_len), _len(s ? _calc_len(max_len) : 0) { }
76
77 /**
78 * Accessors
79 */
80 char *start() const { return (char *)_start; }
81 size_t len() const { return _len; }
82 Type type() const { return _type(_len); }
83
84 /**
85 * Return token as null-terminated string
86 */
87 void string(char *dst, size_t max_len) const {
88 strncpy(dst, start(), min(len() + 1, max_len)); }
89
90 /**
91 * Return true if token is valid
92 */
93 operator bool () const { return _start && _len; }
94
95 /**
96 * Access single characters of token
97 */
98 char operator [] (int idx)
99 {
100 return ((idx >= 0) && ((unsigned)idx < _len)) ? _start[idx] : 0;
101 }
102
103 /**
104 * Return next token
105 */
106 Token next() const { return Token(_start + _len, _max_len - _len); }
107
108 /**
109 * Return next non-whitespace token
110 */
111 Token eat_whitespace() const { return (_type(_len) == WHITESPACE) ? next() : *this; }
112
113 private:
114
115 const char *_start;
116 size_t _max_len;
117 size_t _len;
118
119 /**
120 * Return type of token
121 *
122 * \param max_len maximum token length
123 *
124 * This method is used during the construction of `Token`
125 * objects, in particular for determining the value of the `_len`
126 * member. Therefore, we explicitely pass the `max_len` to the
127 * method. For the public interface, there exists the `type()`
128 * accessor, which relies on `_len` as implicit argument.
129 */
130 Type _type(size_t max_len) const
131 {
132 if (!_start || max_len < 1 || !*_start) return END;
133
134 /* determine the type based on the first character */
135 char c = *_start;
136 if (SCANNER_POLICY::identifier_char(c, 0)) return IDENT;
137 if (is_digit(c)) return NUMBER;
138 if (is_whitespace(c)) return WHITESPACE;
139
140 /* if string is incomplete, discard it (type END) */
141 if (c == `"`)
142 return _quoted_string_len(max_len) ? STRING : END;
143
144 return SINGLECHAR;
145 }
146
147 size_t _quoted_string_len(size_t max_len) const
148 {
149 unsigned i = 0;
150
151 for (; !end_of_quote(&_start[i]) && i < max_len; i++)
152
153 /* string ends without final quotation mark? too bad! */
154 if (!_start[i]) return 0;
155
156 /* exceeded maximum token length */
157 if (i == max_len) return 0;
158
159 /*
160 * We stopped our search at the character before the
161 * final quotation mark but we return the number of
162 * characters including the quotation marks.
163 */
164 return i + 2;
165 }
166
167 /**
168 * Return length of token
169 */
170 int _calc_len(size_t max_len) const
171 {
172 switch (_type(max_len)) {
173
174 case SINGLECHAR:
175 return 1;
176
177 case NUMBER:
178 {
179 unsigned i = 0;
180 for (; i < max_len && is_digit(_start[i]); i++);
181 return i;
182 }
183
184 case IDENT:
185 {
186 unsigned i = 0;
187 for (; i < max_len; i++) {
188 if (SCANNER_POLICY::identifier_char(_start[i], i))
189 continue;
190
191 /* stop if any other (invalid) character occurs */
192 break;
193 }
194 return i;
195 }
196
197 case STRING:
198
199 return _quoted_string_len(max_len);
200
201 case WHITESPACE:
202 {
203 unsigned i = 0;
204 for (; is_whitespace(_start[i]) && i < max_len; i++);
205 return i;
206 }
207
208 case END:
209 default:
210 return 0;
211 }
212 }
213 };
214
215 #endif /* _INCLUDE__UTIL__TOKEN_H_ */