/*
 * \brief  Tokenizer support
 * \author Norman Feske
 * \date   2006-05-19
 */


/*
 * Copyright (C) 2006-2013 Genode Labs GmbH
 *
 * This file is part of the Genode OS framework, which is distributed
 * under the terms of the GNU General Public License version 2.
 */

  13
  14  #ifndef _INCLUDE__UTIL__TOKEN_H_
  15  #define _INCLUDE__UTIL__TOKEN_H_
  16
  17  #include <util/string.h>
  18
  19

/*
 * Forward declarations of the types provided by this header. The
 * definitions follow below, each introduced by its qualified name.
 */
namespace Genode {

	struct Scanner_policy_identifier_with_underline;
	template <typename> class Token;
}

  24
  25
  26

/**

  27   * Scanner policy that accepts underline characters in identifiers

  28   */

  29  struct Genode::Scanner_policy_identifier_with_underline

  30  {

  31     /**

  32      * Return true if character belongs to a valid identifier

  33      *

  34      * \param c  character

  35      * \param i  index of character in token

  36      * \return   true if character is a valid identifier character

  37      *

  38      * Letters and underline characters are allowed anywhere in an

  39      * identifier, digits must not appear at the beginning.

  40      */

  41     static bool identifier_char(char c, unsigned i) {

  42        return is_letter(c) || (c == `_`) || (i && is_digit(c)); }

  43  };

  44
  45
  46

/**

  47   * Token

  48   *

  49   * This class is used to group characters of a string which belong

  50   * to one syntactical token types number, identifier, string,

  51   * whitespace or another single character.

  52   *

  53   * \param SCANNER_POLICY  policy that defines the way of token scanning

  54   *

  55   * See `Scanner_policy_identifier_with_underline` for an example scanner

  56   * policy.

  57   */

  58  template <typename SCANNER_POLICY>

  59  class Genode::Token

  60  {

  61     public:

  62  

  63        enum Type { SINGLECHAR, NUMBER, IDENT, STRING, WHITESPACE, END };

  64  

  65        /**

  66         * Constructor

  67         *

  68         * \param s        start of string to construct a token from

  69         * \param max_len  maximum token length

  70         *

  71         * The `max_len` argument is useful for processing character arrays

  72         * that are not null-terminated.

  73         */

  74        Token(const char *s = 0, size_t max_len = ~0UL)

  75        : _start(s), _max_len(max_len), _len(s ? _calc_len(max_len) : 0) { }

  76  

  77        /**

  78         * Accessors

  79         */

  80        char *start() const { return (char *)_start; }

  81        size_t  len() const { return _len; }

  82        Type   type() const { return _type(_len); }

  83  

  84        /**

  85         * Return token as null-terminated string

  86         */

  87        void string(char *dst, size_t max_len) const {

  88           strncpy(dst, start(), min(len() + 1, max_len)); }

  89  

  90        /**

  91         * Return true if token is valid

  92         */

  93        operator bool () const { return _start && _len; }

  94  

  95        /**

  96         * Access single characters of token

  97         */

  98        char operator [] (int idx)

  99        {

 100           return ((idx >= 0) && ((unsigned)idx < _len)) ? _start[idx] : 0;

 101        }

 102  

 103        /**

 104         * Return next token

 105         */

 106        Token next() const { return Token(_start + _len, _max_len - _len); }

 107  

 108        /**

 109         * Return next non-whitespace token

 110         */

 111        Token eat_whitespace() const { return (_type(_len) == WHITESPACE) ? next() : *this; }

 112  

 113     private:

 114  

 115        const char *_start;

 116        size_t      _max_len;

 117        size_t      _len;

 118  

 119        /**

 120         * Return type of token

 121         *

 122         * \param  max_len  maximum token length

 123         *

 124         * This method is used during the construction of `Token`

 125         * objects, in particular for determining the value of the `_len`

 126         * member. Therefore, we explicitely pass the `max_len` to the

 127         * method. For the public interface, there exists the `type()`

 128         * accessor, which relies on `_len` as implicit argument.

 129         */

 130        Type _type(size_t max_len) const

 131        {

 132           if (!_start || max_len < 1 || !*_start) return END;

 133  

 134           /* determine the type based on the first character */

 135           char c = *_start;

 136           if (SCANNER_POLICY::identifier_char(c, 0)) return IDENT;

 137           if (is_digit(c))                           return NUMBER;

 138           if (is_whitespace(c))                      return WHITESPACE;

 139  

 140           /* if string is incomplete, discard it (type END) */

 141           if (c == `"`)

 142              return _quoted_string_len(max_len) ? STRING : END;

 143  

 144           return SINGLECHAR;

 145        }

 146  

 147        size_t _quoted_string_len(size_t max_len) const

 148        {

 149           unsigned i = 0;

 150  

 151           for (; !end_of_quote(&_start[i]) && i < max_len; i++)

 152  

 153              /* string ends without final quotation mark? too bad! */

 154              if (!_start[i]) return 0;

 155  

 156           /* exceeded maximum token length */

 157           if (i == max_len) return 0;

 158  

 159           /*

 160            * We stopped our search at the character before the

 161            * final quotation mark but we return the number of

 162            * characters including the quotation marks.

 163            */

 164           return i + 2;

 165        }

 166  

 167        /**

 168         * Return length of token

 169         */

 170        int _calc_len(size_t max_len) const

 171        {

 172           switch (_type(max_len)) {

 173  

 174           case SINGLECHAR:

 175              return 1;

 176  

 177           case NUMBER:

 178              {

 179                 unsigned i = 0;

 180                 for (; i < max_len && is_digit(_start[i]); i++);

 181                 return i;

 182              }

 183  

 184           case IDENT:

 185              {

 186                 unsigned i = 0;

 187                 for (; i < max_len; i++) {

 188                    if (SCANNER_POLICY::identifier_char(_start[i], i))

 189                       continue;

 190  

 191                    /* stop if any other (invalid) character occurs */

 192                    break;

 193                 }

 194                 return i;

 195              }

 196  

 197           case STRING:

 198  

 199              return _quoted_string_len(max_len);

 200  

 201           case WHITESPACE:

 202              {

 203                 unsigned i = 0;

 204                 for (; is_whitespace(_start[i]) && i < max_len; i++);

 205                 return i;

 206              }

 207  

 208           case END:

 209           default:

 210              return 0;

 211           }

 212        }

 213  };

214
215 #endif /* _INCLUDE__UTIL__TOKEN_H_ */