ktoken.c - klisp - an open source interpreter for the Kernel Programming Language.

ktoken.c (40482B)
      1 /*
      2 ** ktoken.c
      3 ** Tokenizer for the Kernel Programming Language
      4 ** See Copyright Notice in klisp.h
      5 */
      6 
      7 /*
      8 ** TODO:
      9 **
     10 ** - Support for complete number syntax (complex) (report)
     11 ** - Support for unicode (strings, char and symbols).
     12 **
     13 */
     14 #include <stdio.h>
     15 #include <stdlib.h>
     16 #include <string.h>
     17 #include <ctype.h>
     18 #include <stdint.h>
     19 #include <stdbool.h>
     20 
     21 #include "ktoken.h"
     22 #include "kobject.h"
     23 #include "kstate.h"
     24 #include "kinteger.h"
     25 #include "krational.h"
     26 #include "kreal.h"
     27 #include "kpair.h"
     28 #include "kstring.h"
     29 #include "kbytevector.h"
     30 #include "ksymbol.h"
     31 #include "kkeyword.h"
     32 #include "kerror.h"
     33 #include "kport.h"
     34 
     35 /*
     36 ** Char sets for fast ASCII char classification
     37 */
     38 
     39 /*
     40 ** Char set function/macro interface
     41 */
     42 void kcharset_empty(kcharset);
     43 void kcharset_fill(kcharset, char *);
     44 void kcharset_union(kcharset, kcharset);
     45 /* contains in .h */
     46     
     47 void kcharset_empty(kcharset chs)
     48 {
     49     for (int i = 0; i < 8; i++) {
     50         chs[i] = 0;
     51     }
     52 }
     53 
     54 void kcharset_fill(kcharset chs, char *chars_)
     55 {
     56     unsigned char *chars = (unsigned char *) chars_;
     57     unsigned char ch;
     58 
     59     kcharset_empty(chs);
     60 
     61     while ((ch = *chars++)) {
     62         chs[KCHS_OCTANT(ch)] |= KCHS_BIT(ch);
     63     }
     64 }
     65 
     66 void kcharset_union(kcharset chs, kcharset chs2)
     67 {
     68     for (int i = 0; i < 8; i++) {
     69         chs[i] |= chs2[i];
     70     }
     71 }
     72 
     73 /*
     74 ** Character sets for classification
     75 */
     76 kcharset ktok_alphabetic, ktok_numeric, ktok_whitespace;
     77 kcharset ktok_delimiter, ktok_extended;
     78 kcharset ktok_initial, ktok_subsequent;
     79 
     80 /*
     81 ** Special Tokens 
     82 **
     83 ** TEMP: defined in kstate.h
     84 **
     85 ** RATIONALE:
     86 **
     87 ** Because a pair is not a token, they can be used to represent special tokens
     88 ** instead of creating an otherwise useless special token type
     89 ** lparen, rparen and dot are represented as a pair with the corresponding 
     90 ** char in the car and nil in the cdr.
     91 ** srfi-38 tokens are also represented with a char in the car indicating if 
     92 ** it's a defining token ('=') or a referring token ('#') and the number in 
     93 ** the cdr. 
     94 ** The sexp comment token with a ';' in the car.
     95 ** This way a special token can be easily tested for (with ttispair)
     96 ** and easily classified (with switch(chvalue(kcar(tok)))).
     97 **
     98 */
     99 
    100 void ktok_init(klisp_State *K)
    101 {
    102     /* Character sets */
    103     kcharset_fill(ktok_alphabetic, "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    104                   "abcdefghijklmnopqrstuvwxyz");
    105     kcharset_fill(ktok_numeric, "0123456789");
    106     /* keep synchronized with cases in main tokenizer switch */
    107     kcharset_fill(ktok_whitespace, " \t\v\r\n\f");
    108 
    109     kcharset_fill(ktok_delimiter, "()\";");
    110     kcharset_union(ktok_delimiter, ktok_whitespace);
    111 
    112     kcharset_fill(ktok_initial, "!$%&*./:<=>?@^_~");
    113     kcharset_union(ktok_initial, ktok_alphabetic);
    114 
    115     /* N.B. Unlike in scheme, kernel admits both '.' and 
    116        '@' as initial chars in identifiers, but doesn't allow
    117        '+' or '-'. There are 3 exceptions:
    118        both '+' and '-' alone are identifiers and '.' alone is
    119        not an identifier */
    120     kcharset_fill(ktok_extended, "+-");
    121 
    122     kcharset_empty(ktok_subsequent);
    123     kcharset_union(ktok_subsequent, ktok_initial);
    124     kcharset_union(ktok_subsequent, ktok_numeric);
    125     kcharset_union(ktok_subsequent, ktok_extended);
    126 }
    127 
    128 /*
    129 ** Error management
    130 */
    131 
    132 void clear_shared_dict(klisp_State *K)
    133 {
    134     K->shared_dict = KNIL;
    135 }
    136 
    137 #define ktok_error(K, str) ktok_error_g(K, str, false, KINERT)
    138 #define ktok_error_extra(K, str, extra) ktok_error_g(K, str, true, extra)
    139 
    140 void ktok_error_g(klisp_State *K, char *str, bool extra, TValue extra_value)
    141 {
    142     /* all cleaning is done in throw 
    143        (stacks, shared_dict, rooted objs) */
    144 
    145     /* save the last source code info on the port */
    146     kport_update_source_info(K->curr_port, K->ktok_source_info.line,
    147                              K->ktok_source_info.col);
    148 
    149     /* include the source info (and extra value if present) in the error */
    150     TValue irritants;
    151     if (extra) {
    152         krooted_tvs_push(K, extra_value); /* will be popped by throw */
    153         TValue si = ktok_get_source_info(K);
    154         krooted_tvs_push(K, si); /* will be popped by throw */
    155         irritants = klist_g(K, false, 2, si, extra_value);
    156     } else {
    157         irritants = ktok_get_source_info(K);
    158     }
    159     krooted_tvs_push(K, irritants); /* will be popped by throw */
    160     klispE_throw_with_irritants(K, str, irritants);
    161 }
    162 
    163 /*
    164 ** Underlying stream interface & source code location tracking
    165 */
    166 
    167 /* TODO/OPTIMIZE We should use buffering to shorten the 
    168    average code path to read each char */
    169 /* this reads one character from curr_port */
    170 int ktok_ggetc(klisp_State *K)
    171 {
    172     /* XXX when full unicode is used (uint32_t) a different way should
    173        be use to signal EOF */
    174 	          
    175     TValue port = K->curr_port;
    176     if (ttisfport(port)) {
    177         /* fport */
    178         FILE *file = kfport_file(port);
    179 
    180         /* LOCK: only a single lock should be acquired */
    181         klisp_unlock(K);
    182         int chi = getc(file);
    183         klisp_lock(K);
    184 
    185         if (chi == EOF) {
    186             /* NOTE: eof doesn't change source code location info */
    187             if (ferror(file) != 0) {
    188                 /* clear error marker to allow retries later */
    189                 clearerr(file);
    190                 /* TODO put error info on the error obj */
    191                 ktok_error(K, "reading error");
    192                 return 0;
    193             } else { /* if (feof(file) != 0) */
    194                 /* let the eof marker set */
    195                 K->ktok_seen_eof = true;
    196                 return EOF;
    197             }
    198         } else 
    199             return chi;
    200     } else {
    201         /* mport */
    202         if (kport_is_binary(port)) {
    203             /* bytevector port */
    204             if (kmport_off(port) >= kbytevector_size(kmport_buf(port))) {
    205                 K->ktok_seen_eof = true;
    206                 return EOF;
    207             }
    208             int chi = kbytevector_buf(kmport_buf(port))[kmport_off(port)];
    209             ++kmport_off(port);
    210             return chi;
    211         } else {
    212             /* string port */
    213             if (kmport_off(port) >= kstring_size(kmport_buf(port))) {
    214                 K->ktok_seen_eof = true;
    215                 return EOF;
    216             }
    217             int chi = kstring_buf(kmport_buf(port))[kmport_off(port)];
    218             ++kmport_off(port);
    219             return chi;
    220         }
    221     }
    222 }
    223 
    224 /* this returns one character to curr_port */
    225 void ktok_gungetc(klisp_State *K, int chi)
    226 {
    227     if (chi == EOF)
    228         return;
    229 
    230     TValue port = K->curr_port;
    231     if (ttisfport(port)) {
    232         /* fport */
    233         FILE *file = kfport_file(port);
    234 
    235         if (ungetc(chi, file) == EOF) {
    236             if (ferror(file) != 0) {
    237                 /* clear error marker to allow retries later */
    238                 clearerr(file);
    239             }
    240             /* TODO put error info on the error obj */
    241             ktok_error(K, "reading error");
    242             return;
    243         }
    244     } else {
    245         /* mport */
    246         if (kport_is_binary(port)) {
    247             /* bytevector port */
    248             --kmport_off(port);
    249         } else {
    250             /* string port */
    251             --kmport_off(port);
    252         }
    253     }
    254 }
    255 
    256 int ktok_peekc_getc(klisp_State *K, bool peekp)
    257 {
    258     /* WORKAROUND: for stdin line buffering & reading of EOF, this flag
    259        is reset on every read */
    260     /* Otherwise, at least in linux, after reading or peeking an EOF from the 
    261        console, the next char isn't eof anymore */
    262     if (K->ktok_seen_eof)
    263         return EOF;
    264 
    265     int chi = ktok_ggetc(K);
    266 
    267     if (peekp) {
    268         ktok_gungetc(K, chi);
    269         return chi;
    270     }
    271 
    272     /* track source code location before returning the char */
    273     if (chi == '\t') {
    274         /* align column to next tab stop */
    275         K->ktok_source_info.col = 
    276             (K->ktok_source_info.col + K->ktok_source_info.tab_width) -
    277             (K->ktok_source_info.col % K->ktok_source_info.tab_width);
    278     } else if (chi == '\n') {
    279         K->ktok_source_info.line++;
    280         K->ktok_source_info.col = 0;
    281     } else {
    282         K->ktok_source_info.col++;
    283     }
    284     return chi;
    285 }
    286 
    287 void ktok_save_source_info(klisp_State *K)
    288 {
    289     K->ktok_source_info.saved_line = K->ktok_source_info.line;
    290     K->ktok_source_info.saved_col = K->ktok_source_info.col;
    291 }
    292 
    293 TValue ktok_get_source_info(klisp_State *K)
    294 {
    295     /* TEMP: for now, lines and column names are fixints */
    296     TValue pos = kcons(K, i2tv(K->ktok_source_info.saved_line),
    297                        i2tv(K->ktok_source_info.saved_col));
    298     krooted_tvs_push(K, pos);
    299     /* the filename is rooted in the port */
    300     TValue res = kcons(K, K->ktok_source_info.filename, pos);
    301     krooted_tvs_pop(K);
    302     return res;
    303 }
    304 
    305 void ktok_set_source_info(klisp_State *K, TValue filename, int32_t line,
    306                           int32_t col)
    307 {
    308     K->ktok_source_info.filename = filename;
    309     K->ktok_source_info.line = line;
    310     K->ktok_source_info.col = col;
    311 }
    312 
    313 
    314 /*
    315 ** ktok_read_token() helpers
    316 */
    317 void ktok_ignore_whitespace(klisp_State *K);
    318 void ktok_ignore_single_line_comment(klisp_State *K);
    319 void ktok_ignore_multi_line_comment(klisp_State *K);
    320 bool ktok_check_delimiter(klisp_State *K);
    321 char ktok_read_hex_escape(klisp_State *K);
    322 TValue ktok_read_string(klisp_State *K);
    323 TValue ktok_read_special(klisp_State *K);
    324 TValue ktok_read_number(klisp_State *K, char *buf, int32_t len,
    325                         bool has_exactp, bool exactp, bool has_radixp, 
    326                         int32_t radix);
    327 TValue ktok_read_maybe_signed_numeric(klisp_State *K);
    328 TValue ktok_read_identifier_or_dot(klisp_State *K, bool keywordp);
    329 TValue ktok_read_bar_identifier(klisp_State *K, bool keywordp);
    330 int ktok_read_until_delimiter(klisp_State *K);
    331 
    332 /*
    333 ** Main tokenizer function
    334 */
    335 TValue ktok_read_token(klisp_State *K)
    336 {
    337     klisp_assert(ks_tbisempty(K));
    338 
    339     while(true) {
    340         /* save the source info in case a token starts here */
    341         ktok_save_source_info(K);
    342 
    343         int chi = ktok_peekc(K);
    344 
    345         switch(chi) {
    346         case EOF:
    347             ktok_getc(K);
    348             return KEOF;
    349         case ' ':
    350         case '\n':
    351         case '\r':
    352         case '\t':
    353         case '\v': 
    354         case '\f': /* Keep synchronized with whitespace chars */
    355             ktok_ignore_whitespace(K);
    356             continue;
    357         case ';':
    358             ktok_ignore_single_line_comment(K);
    359             continue;
    360         case '(':
    361             ktok_getc(K);
    362             return G(K)->ktok_lparen;
    363         case ')':
    364             ktok_getc(K);
    365             return G(K)->ktok_rparen;
    366         case '"':
    367             return ktok_read_string(K);
    368         case '|':
    369             return ktok_read_bar_identifier(K, false);
    370 /* TODO use read_until_delimiter in all these cases */
    371         case '#': {
    372             ktok_getc(K);
    373             chi = ktok_peekc(K);
    374             switch(chi) {
    375             case EOF:
    376                 ktok_error(K, "# constant is too short");
    377                 return KINERT; /* avoid warning */
    378             case '!': /* single line comment (alternative syntax) */
    379                 /* this handles the #! style script header too! */
    380                 ktok_ignore_single_line_comment(K);
    381                 continue;
    382             case '|': /* nested/multiline comment */
    383                 ktok_getc(K); /* discard the '|' */
    384                 klisp_assert(K->ktok_nested_comments == 0);
    385                 K->ktok_nested_comments = 1;
    386                 ktok_ignore_multi_line_comment(K);
    387                 continue;
    388             case ';': /* sexp comment */
    389                 ktok_getc(K); /* discard the ';' */
    390                 return G(K)->ktok_sexp_comment;
    391             case ':': /* keyword */
    392                 ktok_getc(K); /* discard the ':' */
    393                 chi = ktok_peekc(K);
    394                 if (chi == EOF) {
    395                     ktok_error(K, "# constant is too short");
    396                     return KINERT; /* avoid warning */
    397                 } else if (chi == '|') {
    398                     return ktok_read_bar_identifier(K, true);
    399                 } else if (chi == '\\' || ktok_is_initial(chi)) {
    400                     return ktok_read_identifier_or_dot(K, true);
    401                 } else if (chi == '+' || chi == '-') {
    402                     char ch = (char) chi;
    403                     ktok_getc(K); /* discard the '+' or '-' */
    404                     if (ktok_check_delimiter(K)) {
    405                         return kkeyword_new_bs(K, &ch, 1);
    406                     } else {
    407                         ktok_error_extra(K, "invalid start in keyword", 
    408                                          ch2tv(ch));
    409                         return KINERT; /* avoid warning */
    410                     }
    411                 } else {
    412                     ktok_error_extra(K, "invalid char starting keyword",
    413                                      ch2tv((char) chi));
    414                     return KINERT; /* avoid warning */
    415                 }
    416             default:
    417                 return ktok_read_special(K);
    418             }
    419         }
    420         case '0': case '1': case '2': case '3': case '4': 
    421         case '5': case '6': case '7': case '8': case '9': {
    422             /* positive number, no exactness or radix indicator */
    423             int32_t buf_len = ktok_read_until_delimiter(K);
    424             char *buf = ks_tbget_buffer(K);
    425             /* read number should free the tbbuffer */
    426             return ktok_read_number(K, buf, buf_len, false, false, false, 10);
    427         }
    428         case '+': case '-':
    429             /* signed number, no exactness or radix indicator */
    430             return ktok_read_maybe_signed_numeric(K);
    431         case '\\': /* this is a symbol that starts with an hex escape */
    432             /* These should be kept synchronized with initial */
    433         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': 
    434         case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': 
    435         case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': 
    436         case 'V': case 'W': case 'X': case 'Y': case 'Z': 
    437         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': 
    438         case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': 
    439         case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': 
    440         case 'v': case 'w': case 'x': case 'y': case 'z': 
    441         case '!': case '$': case '%': case '&': case '*': case '/': case ':': 
    442         case '<': case '=': case '>': case '?': case '@': case '^': case '_': 
    443         case '~': 
    444         case '.': /* this is either a symbol or a dot token */
    445             /*
    446             ** N.B.: the cases for '+', and '-', were already 
    447             ** considered 
    448             */
    449             return ktok_read_identifier_or_dot(K, false);
    450         default:
    451             chi = ktok_getc(K);
    452             ktok_error_extra(K, "unrecognized token starting char", 
    453                              ch2tv((char) chi));
    454             /* avoid warning */
    455             return KINERT;
    456         }
    457     }
    458 }
    459 
    460 /*
    461 ** Comments and Whitespace
    462 */
    463 void ktok_ignore_single_line_comment(klisp_State *K)
    464 {
    465     int chi;
    466     do {
    467         chi = ktok_getc(K);
    468     } while (chi != EOF && chi != '\n');
    469 }
    470 
    471 void ktok_ignore_multi_line_comment(klisp_State *K)
    472 {
    473     /* the first "#|' was already read */
    474     klisp_assert(K->ktok_nested_comments == 1);
    475     int chi;
    476     TValue last_nested_comment_si = ktok_get_source_info(K);
    477     krooted_vars_push(K, &last_nested_comment_si);
    478     ks_spush(K, KNIL);
    479 
    480     while(K->ktok_nested_comments > 0) {
    481         chi = ktok_peekc(K);
    482         while (chi != EOF && chi != '|' && chi != '#') {
    483             UNUSED(ktok_getc(K));
    484             chi = ktok_peekc(K);
    485         }
    486         if (chi == EOF)
    487             goto eof_error;
    488 
    489         char first_char = (char) chi;
    490 
    491         /* this first char will actually be the same just peeked, that's no
    492            problem, it will save the source info the first time around the 
    493            loop */
    494         chi = ktok_peekc(K);
    495         while (chi != EOF && chi == first_char) {
    496             ktok_save_source_info(K);
    497             UNUSED(ktok_getc(K));
    498             chi = ktok_peekc(K);
    499         }
    500         if (chi == EOF)
    501             goto eof_error;
    502 
    503         UNUSED(ktok_getc(K));
    504 
    505         if (chi == '#') {
    506             /* close comment (first char was '|', so the seq is "|#") */
    507             --K->ktok_nested_comments;
    508             last_nested_comment_si = ks_spop(K);
    509         } else if (chi == '|') {
    510             /* open comment (first char was '#', so the seq is "#|") */
    511             klisp_assert(K->ktok_nested_comments < 1000);
    512             ++K->ktok_nested_comments;
    513             ks_spush(K, last_nested_comment_si);
    514             last_nested_comment_si = ktok_get_source_info(K);
    515         } 
    516         /* else lone '#' or '|', just continue */
    517     }
    518     krooted_vars_pop(K);
    519     return;
    520 eof_error:
    521     K->ktok_nested_comments = 0;
    522     ktok_save_source_info(K);
    523     UNUSED(ktok_getc(K));
    524     krooted_vars_pop(K);
    525     ktok_error_extra(K, "unterminated multi line comment", last_nested_comment_si);
    526 }
    527 
    528 void ktok_ignore_whitespace(klisp_State *K)
    529 {
    530     /* NOTE: if it's not whitespace do nothing (even on eof) */
    531     while(true) {
    532         int chi = ktok_peekc(K);
    533 
    534         if (chi == EOF) {
    535             return;
    536         } else {
    537             char ch = (char) chi;
    538             if (ktok_is_whitespace(ch)) {
    539                 ktok_getc(K);
    540             } else {
    541                 return;
    542             }
    543         }
    544     }
    545 }
    546 
    547 /*
    548 ** Delimiter checking
    549 */
    550 bool ktok_check_delimiter(klisp_State *K)
    551 {
    552     int chi = ktok_peekc(K);
    553     return (ktok_is_delimiter(chi));
    554 }
    555 
    556 /*
    557 ** Returns the number of bytes read
    558 */
    559 int32_t ktok_read_until_delimiter(klisp_State *K)
    560 {
    561     int i = 0;
    562 
    563     while (!ktok_check_delimiter(K)) {
    564         /* NOTE: can't be eof, because eof is a delimiter */
    565         char ch = (char) ktok_getc(K);
    566         ks_tbadd(K, ch);
    567         i++;
    568     }
    569     ks_tbadd(K, '\0');
    570     return i;
    571 }
    572 
    573 /*
    574 ** Numbers
    575 ** TEMP: for now, only integers & rationals
    576 ** The digits are in buf, that must be freed after use,
    577 ** len should be at least one 
    578 */
    579 TValue ktok_read_number(klisp_State *K, char *buf, int32_t len, 
    580                         bool has_exactp, bool exactp, bool has_radixp, 
    581                         int32_t radix)
    582 {
    583     UNUSED(len); /* not needed really, buf ends with '\0' */
    584     TValue n;
    585     if (radix == 10) {
    586         /* only allow decimals with radix 10 */
    587         bool decimalp = false;
    588         if (!krational_read_decimal(K, buf, radix, &n, NULL, &decimalp)) {
    589             /* TODO throw meaningful error msgs, use last param */
    590             ktok_error(K, "Bad format in number");
    591             return KINERT;
    592         }
    593         if (decimalp && !has_exactp) {
    594             /* handle decimal format as an explicit #i */
    595             has_exactp = true;
    596             exactp = false;
    597         }
    598     } else {
    599         if (!krational_read(K, buf, radix, &n, NULL)) {
    600             /* TODO throw meaningful error msgs, use last param */
    601             ktok_error(K, "Bad format in number");
    602             return KINERT;
    603         }
    604     }
    605     ks_tbclear(K);
    606     
    607     if (has_exactp && !exactp) {
    608         krooted_tvs_push(K, n);
    609         n = kexact_to_inexact(K, n);
    610         krooted_tvs_pop(K);
    611     }
    612     return n;
    613 }
    614 
    615 TValue ktok_read_maybe_signed_numeric(klisp_State *K)
    616 {
    617     /* NOTE: can't be eof, it's either '+' or '-' */
    618     char ch = (char) ktok_getc(K);
    619     if (ktok_check_delimiter(K)) {
    620         ks_tbadd(K, ch);
    621         ks_tbadd(K, '\0');
    622         /* save the source info in the symbol */
    623         TValue si = ktok_get_source_info(K);
    624         krooted_tvs_push(K, si); /* will be popped by throw */
    625         TValue new_sym = ksymbol_new_bs(K, ks_tbget_buffer(K), 1, si);
    626         krooted_tvs_pop(K); /* already in symbol */
    627         krooted_tvs_push(K, new_sym);
    628         ks_tbclear(K); /* this shouldn't cause gc, but just in case */
    629         krooted_tvs_pop(K);
    630         return new_sym;
    631     } else {
    632         ks_tbadd(K, ch);
    633         int32_t buf_len = ktok_read_until_delimiter(K)+1;
    634         char *buf = ks_tbget_buffer(K);
    635         /* no exactness or radix prefix, default radix: 10 */
    636         return ktok_read_number(K, buf, buf_len, false, false, false, 10);
    637     }
    638 }
    639 
    640 /*
    641 ** Hex escapes for strings and symbols
    642 ** "#\xXXXXXX;"
    643 ** "#\x" already read
    644 */
    645 char ktok_read_hex_escape(klisp_State *K)
    646 {
    647     /* enough space for any unicode char + 2 */
    648     int ch;
    649     char buf[10];
    650     int c = 0;
    651     bool at_least_onep = false;
    652     for(ch = ktok_getc(K); ch != EOF && ch != ';'; 
    653         ch = ktok_getc(K)) {
    654         if (!ktok_is_digit(ch, 16)) {
    655             ktok_error_extra(K, "Invalid char found in hex escape", 
    656                              ch2tv(ch));
    657             return '\0'; /* avoid warning */
    658         } 
    659         /* 
    660         ** This will allow one space for '\0' and one extra
    661         ** char in case the value is too big, and so will 
    662         ** naturally result in a value outside the unicode
    663         ** range without the need to record any extra 
    664         ** characters other than the first 8 (without 
    665         ** leading zeroes).
    666         */
    667         at_least_onep = true;
    668         if (c < sizeof(buf) - 1 && (c > 0 || ch != '0')) 
    669             buf[c++] = ch;
    670     }
    671     if (ch == EOF) {
    672         ktok_error(K, "EOF found while reading hex escape");
    673         return '\0'; /* avoid warning */
    674     } else if (!at_least_onep) {
    675         ktok_error(K, "Empty hex escape found");
    676         return '\0'; /* avoid warning */
    677     } else if (c == 0) { /* this is the case of a NULL char */
    678         buf[c++] = '0'; 
    679     }
    680     buf[c++] = '\0';
    681     /* buf now contains the hex value of the char */
    682     TValue n;
    683     int res = kinteger_read(K, buf, 16, &n, NULL);
    684     /* can't fail, all digits were checked already */
    685     klisp_assert(res == true);
    686     if (!ttisfixint(n) || ivalue(n) > 127) {
    687         krooted_tvs_push(K, n);
    688         ktok_error_extra(K, "hex escaped char out of ASCII range", n);
    689         return '\0'; /* avoid warning */
    690     }
    691     /* all ok, we pass the char */
    692     return (char) ivalue(n);
    693 }
    694 
    695 /*
    696 ** Strings
    697 */
    698 TValue ktok_read_string(klisp_State *K)
    699 {
    700     /* discard opening quote */
    701     ktok_getc(K);
    702 
    703     bool done = false;
    704     int i = 0;
    705 
    706     while(!done) {
    707         int ch = ktok_getc(K);
    708     just_read: /* this comes from escaped newline */
    709         if (ch == EOF) {
    710             ktok_error(K, "EOF found while reading a string");
    711             return KINERT; /* avoid warning */
    712         } else 	if (ch < 0 || ch > 127) {
    713             ktok_error(K, "Non ASCII char found while reading a string");
    714             return KINERT; /* avoid warning */
    715         }
    716 
    717 
    718         if (ch == '"') {
    719             ks_tbadd(K, '\0');
    720             done = true;
    721         } else if (ch == '\\') {
    722             ch = ktok_getc(K);
    723 	
    724             if (ch == EOF) {
    725                 ktok_error(K, "EOF found while reading a string");
    726                 return KINERT; /* avoid warning */
    727             }
    728 
    729             switch(ch) {
    730                 /* These two will self insert */
    731             case '"':
    732             case '\\':
    733                 break;
    734                 /* These are naming chars (like in c, mostly) */
    735             case '0':
    736                 ch = '\0';
    737                 break;
    738             case 'a':
    739                 ch = '\a';
    740                 break;
    741             case 'b':
    742                 ch = '\b';
    743                 break;
    744             case 't':
    745                 ch = '\t';
    746                 break;
    747             case 'n':
    748                 ch = '\n';
    749                 break;
    750             case 'r':
    751                 ch = '\r';
    752                 break;
    753             case 'v':
    754                 ch = '\v';
    755                 break;
    756             case 'f':
    757                 ch = '\f';
    758                 break;
    759                 /* 
    760                 ** These signal an escaped newline (not included in string)
    761                 */
    762             case ' ':
    763             case '\t':
    764                 /* eat up all intraline spacing */
    765                 while((ch = ktok_getc(K)) != EOF &&
    766                       (ch == ' ' || ch == '\t'))
    767                     ;
    768                 if (ch == EOF) {
    769                     ktok_error(K, "EOF found while reading a string");
    770                     return KINERT; /* avoid warning */
    771                 } else if (ch != '\n' && ch != '\r') {
    772                     ktok_error(K, "Invalid char found after \\ while "
    773                                "reading a string");
    774                     return KINERT; /* avoid warning */
    775                 }
    776                 /* fall through */
    777             case '\n': 
    778             case '\r':
    779                 /* use the r6rs definition for line end */
    780                 if (ch == 'r') {
    781                     ch = ktok_peekc(K);
    782                     if (ch != EOF && ch == '\n')
    783                         ktok_getc(K);
    784                 }
    785                 /* eat up all intraline spacing */
    786                 while((ch = ktok_getc(K)) != EOF &&
    787                       (ch == ' ' || ch == '\t'))
    788                     ;
    789                 /* this will check for EOF and continue reading the 
    790                    string at the top of the loop */
    791                 goto just_read;
    792                 /* This is an hex escaped char */
    793             case 'x': 
    794                 ch = ktok_read_hex_escape(K);
    795                 break;
    796             default:
    797                 ktok_error_extra(K, "Invalid char after '\\' " 
    798                                  "while reading a string", ch2tv(ch));
    799                 return KINERT; /* avoid warning */
    800             }
    801             ks_tbadd(K, ch);
    802             ++i;
    803         } else { 
    804             ks_tbadd(K, ch);
    805             ++i;
    806         }
    807     }
    808     /* TEMP: for now strings "read" are mutable but strings "loaded" are
    809        not */
    810     TValue new_str = kstring_new_bs_g(K, K->read_mconsp, 
    811                                       ks_tbget_buffer(K), i); 
    812     krooted_tvs_push(K, new_str);
    813     ks_tbclear(K); /* shouldn't cause gc, but still */
    814     krooted_tvs_pop(K);
    815     return new_str;
    816 }
    817 
    818 /*
    819 ** Special constants (starting with "#")
    820 ** (Special number syntax, char constants, #ignore, #inert, srfi-38 tokens) 
    821 */
    822 
    823 /* this include the named chars as a subcase */
    824 struct kspecial_token {
    825     const char *ext_rep; /* downcase external representation */
    826     TValue obj;
    827 } kspecial_tokens[] = { { "#t", KTRUE_ },
    828                         { "#f", KFALSE_ },
    829                         { "#ignore", KIGNORE_ },
    830                         { "#inert", KINERT_ },
    831                         { "#e+infinity", KEPINF_ },
    832                         { "#e-infinity", KEMINF_ },
    833                         { "#i+infinity", KIPINF_ },
    834                         { "#i-infinity", KIMINF_ },
    835                         { "#real", KRWNPV_ },
    836                         { "#undefined", KUNDEF_ },
    837                         /* 
    838                         ** Character names 
    839                         ** (r7rs + vtab from r6rs) 
    840                         */
    841                         { "#\\null", KNULL_ },
    842                         { "#\\alarm", KALARM_ },
    843                         { "#\\backspace", KBACKSPACE_ },
    844                         { "#\\tab", KTAB_ },
    845                         { "#\\newline", KNEWLINE_ }, /* kernel */
    846                         { "#\\return", KRETURN_ },
    847                         { "#\\escape", KESCAPE_ },
    848                         { "#\\space", KSPACE_ }, /* kernel */
    849                         { "#\\delete", KDELETE_ },
    850                         { "#\\vtab", KVTAB_ }, /* r6rs, only */
    851                         { "#\\formfeed", KFORMFEED_ } /* r6rs in strings */
    852 }; 
    853 
    854 #define MAX_EXT_REP_SIZE 64  /* all special tokens have less than 64 chars */
    855 
    856 TValue ktok_read_special(klisp_State *K)
    857 {
    858     /* the # is already consumed, add it manually */
    859     ks_tbadd(K, '#');
    860     int32_t buf_len = ktok_read_until_delimiter(K) + 1;
    861     char *buf = ks_tbget_buffer(K);
    862 
    863     if (buf_len < 2) {
    864         /* we need at least one char in addition to the '#' */
    865         ktok_error(K, "# constant is too short");
    866         /* avoid warning */
    867         return KINERT;
    868     }
    869 
    870     /* first check that is not an output only representation, 
    871        they begin with '#[' and end with ']', but we know
    872        that buf[0] == '#' */
    873     if (buf_len > 2 && buf[1] == '[' && buf[buf_len-1] == ']') {
    874         ktok_error(K, "output only representation found");
    875         /* avoid warning */
    876         return KINERT;
    877     }
    878 
    879     /* Then check for simple chars, this is the only thing
    880        that is case dependant, so after this we downcase buf
    881        (except that an escaped char needs a small 'x' */
    882     /* REFACTOR: move this to a new function */
    883     /* char constant, needs at least 3 chars unless it's a delimiter
    884      * char! */
    885     if (buf_len == 2 && buf[1] == '\\') {
    886         /* was a delimiter char... read it */
    887         int ch_i = ktok_getc(K);
    888         if (ch_i == EOF) {
    889             ktok_error(K, "EOF found while reading character name");
    890             return KINERT; /* avoid warning */
    891         }
    892         ks_tbclear(K);
    893         return ch2tv((char)ch_i);
    894     } else if (buf[1] == '\\') {
    895         /* 
    896         ** RATIONALE: in the scheme spec (R5RS) it says that only alphabetic 
    897         ** char constants need a delimiter to disambiguate the cases with 
    898         ** character names. It would be more consistent if all characters
    899         ** needed a delimiter (and is probably implied by the yet incomplete
    900         ** Kernel report (R-1RK))
    901         ** For now we follow the scheme report 
    902         */
    903         char ch = buf[2]; /* we know buf_len > 2 */
    904 
    905         if (ch < 0 || ch > 127) {
    906             ktok_error(K, "Non ASCII char found as character constant");
    907             /* avoid warning */
    908             return KINERT;
    909         } 
    910 
    911         if (!ktok_is_alphabetic(ch) || buf_len == 3) { /* simple char */
    912             ks_tbclear(K);
    913             return ch2tv(ch);
    914         }
    915 
    916         /* char names are a subcase of special tokens so this case
    917            will be handled later */
    918         /* fall through */
    919     }
    920 
    921     /* first save the third char, in case it's an hex escaped char
    922        (that should be a lowercase x)  */
    923     char saved_third = buf[2]; /* there's at least 2 chars, so in the worst
    924                                   case buf[2] is just '\0' */
    925 
    926     /* now, we ignore case in all remaining comparisons */
    927     size_t i = 0;
    928     for(char *str2 = buf; i < buf_len; ++str2, ++i)
    929         *str2 = tolower(*str2);
    930 
    931     /* REFACTOR: move this to a new function */
    932     /* then check the known constants (including named characters) */
    933     size_t stok_size = sizeof(kspecial_tokens) / 
    934         sizeof(struct kspecial_token);
    935     for (i = 0; i < stok_size; i++) {
    936         struct kspecial_token token = kspecial_tokens[i];
    937         /* NOTE: must check type because buf may contain embedded '\0's */
    938         if (buf_len == strlen(token.ext_rep) &&
    939             strcmp(token.ext_rep, buf) == 0) {
    940             ks_tbclear(K);
    941             return token.obj; 
    942         }
    943     }
    944 
    945     /* It wasn't a special token or named char, but it can still be a srfi-38
    946        token or a character escape */
    947 
    948     if (buf[1] == '\\') { /* this is to have a meaningful error msg */
    949         if (saved_third != 'x') { /* case is significant here, so
    950                                      we use the saved char */
    951             ktok_error(K, "Unrecognized character name");
    952             return KINERT;
    953         }
    954         /* We already checked that length != 3 (x is alphabetic), 
    955            so there's at least on more char */
    956         TValue n;
    957         char *end;
    958 
    959         /* test for - and + explicitly, becayse kinteger read would parse them
    960            without complaining (it will also parse spaces, but we read until 
    961            delimiter so... */
    962         if (buf[3] == '-' || buf[3] == '+' ||
    963             !kinteger_read(K, buf+3, 16, &n, &end) || 
    964             end - buf != buf_len) {
    965             ktok_error(K, "Bad char in hex escaped character constant");
    966             return KINERT;
    967         } else if (!ttisfixint(n) || ivalue(n) > 127) {
    968             ktok_error(K, "Non ASCII char found in hex escaped character constant");
    969             /* avoid warning */
    970             return KINERT;
    971         } else {
    972             /* all ok, we just clean up and return the char */
    973             ks_tbclear(K);
    974             return ch2tv(ivalue(n));
    975         }
    976     }
    977 
    978     /* REFACTOR: move this to a new function */
    979     /* It was not a special token so it must be either a srfi-38 style
    980        token, or a number. srfi-38 tokens are a '#' a 
    981        decimal number and end with a '=' or a '#' */
    982     if (buf_len > 2 && ktok_is_numeric(buf[1])) {
    983         /* NOTE: it's important to check is_numeric to avoid problems with 
    984            sign in kinteger_read */
    985         /* srfi-38 type token (can be either a def or ref) */
    986         /* TODO: lift this implementation restriction */
    987         /* IMPLEMENTATION RESTRICTION: only allow fixints in shared tokens */
    988         char ch = buf[buf_len-1]; /* remember last char */
    989         buf[buf_len-1] = '\0'; /* replace last char with 0 to read number */
    990 
    991         if (ch != '#' && ch != '=') {
    992             ktok_error(K, "Missing last char in srfi-38 token");
    993             return KINERT;
    994         } /* else buf[i] == '#' or '=' */
    995         TValue n;
    996         char *end;
    997         /* 10 is the radix for srfi-38 tokens, buf+1 to jump over the '#',
    998            end+1 to count the last char */
    999         /* N.B. buf+1 can't be + or -, we already tested numeric before */
   1000         if (!kinteger_read(K, buf+1, 10, &n, &end) || end+1 - buf != buf_len) {
   1001             ktok_error(K, "Bad char in srfi-38 token");
   1002             return KINERT;
   1003         } else if (!ttisfixint(n)) {
   1004             ktok_error(K, "IMP. RESTRICTION: shared token too big");
   1005             /* avoid warning */
   1006             return KINERT;
   1007         }
   1008         ks_tbclear(K);
   1009         /* GC: no need to root n, for now it's a fixint */
   1010         return kcons(K, ch2tv(ch), n);
   1011     }
   1012     
   1013     /* REFACTOR: move to new function */
   1014 
   1015     /* the only possibility left is that it is a number with
   1016        an exactness or radix refix */
   1017     bool has_exactp = false;
   1018     bool exactp = false;  /* the default exactness will depend on the format */
   1019     bool has_radixp = false;
   1020     int32_t radix = 10;
   1021 	
   1022     int32_t idx = 1;
   1023     while (idx < buf_len) {
   1024         char ch = buf[idx];
   1025         switch(ch) {
   1026         case 'i':
   1027         case 'e':
   1028             if (has_exactp) {
   1029                 ktok_error(K, "two exactness prefixes in number");
   1030                 return KINERT;
   1031             }
   1032             has_exactp = true;
   1033             exactp = (ch == 'e');
   1034             break;
   1035         case 'b': radix = 2; goto RADIX;
   1036         case 'o': radix = 8; goto RADIX;
   1037         case 'd': radix = 10; goto RADIX;
   1038         case 'x': radix = 16; goto RADIX;
   1039         RADIX: 
   1040             if (has_radixp) {
   1041                 ktok_error(K, "two radix prefixes in number");
   1042                 return KINERT;
   1043             }
   1044             has_radixp = true;
   1045             break;
   1046         default:
   1047             ktok_error(K, "unknown # constant or "
   1048                        "unexpected char in number after #");
   1049             /* avoid warning */
   1050             return KINERT;
   1051         }
   1052         ++idx;
   1053         if (idx == buf_len)
   1054             break;
   1055         ch = buf[idx];
   1056 
   1057         switch(ch) {
   1058         case '#': {
   1059             ++idx; /* get next exacness or radix prefix */
   1060             break;
   1061         }
   1062         case '0': case '1': case '2': case '3': case '4':
   1063         case '5': case '6': case '7': case '8': case '9': 
   1064         case 'a': case 'b': case 'c': case 'd': case 'e':
   1065         case 'f': case '+': case '-': { /* read the number */
   1066             if (idx == buf_len) {
   1067                 ktok_error(K, "no digits found in number");
   1068             } else {
   1069                 return ktok_read_number(K, buf+idx, buf_len - idx,
   1070                                         has_exactp, exactp, 
   1071                                         has_radixp, radix);
   1072             }
   1073         }
   1074         default:
   1075             ktok_error(K, "unexpected char in number");
   1076             /* avoid warning */
   1077             return KINERT;
   1078         }
   1079     }
   1080     /* this means that the number wasn't found after the prefixes */
   1081     ktok_error(K, "no digits found in number");
   1082     /* avoid warning */
   1083     return KINERT;
   1084 }
   1085 
   1086 /*
   1087 ** Identifiers & Keywords (and dot token)
   1088 */
   1089 TValue ktok_read_identifier_or_dot(klisp_State *K, bool keywordp)
   1090 {
   1091     bool seen_dot = false;
   1092     int32_t i = 0;
   1093     while (!ktok_check_delimiter(K)) {
   1094         /* NOTE: can't be eof, because eof is a delimiter */
   1095         char ch = (char) ktok_getc(K);
   1096         /* this is needed to differentiate a dot from an equivalent escape */
   1097         seen_dot |= ch == '.';
   1098         /* NOTE: is_subsequent of '\0' is false, so no embedded '\0' */
   1099         if (ktok_is_subsequent(ch)) {
   1100             /* downcase all non-escaped chars */
   1101             ks_tbadd(K, tolower(ch));
   1102             ++i;
   1103         } else if (ch == '\\') {
   1104             /* should be inline hex escape */
   1105             ch = ktok_getc(K);
   1106             if (ch == EOF) {
   1107                 ktok_error(K, "EOF found while reading character escape");
   1108             } else if (ch != 'x') {
   1109                 ktok_error_extra(K, keywordp? 
   1110                                  "Invalid char after \\ in keyword" :
   1111                                  "Invalid char after \\ in identifier", 
   1112                                  ch2tv((char)ch));
   1113             }
   1114             ch = ktok_read_hex_escape(K);
   1115             /* don't downcase escaped chars */
   1116             ks_tbadd(K, ch);
   1117             ++i;
   1118         } else {
   1119             ktok_error_extra(K, keywordp? "Invalid char in keyword" :
   1120                              "Invalid char in identifier", ch2tv((char)ch));
   1121         }
   1122     }
   1123 
   1124     if (i == 1 && seen_dot) {
   1125         if (keywordp) {
   1126             ktok_error(K, "Invalid syntax in keyword");
   1127             return KINERT; /* avoid warning */
   1128         } else {
   1129             ks_tbclear(K);
   1130             return G(K)->ktok_dot;
   1131         }
   1132     }
   1133 
   1134     ks_tbadd(K, '\0');
   1135     TValue new_obj;
   1136     if (keywordp) {
   1137         new_obj = kkeyword_new_bs(K, ks_tbget_buffer(K), i);
   1138     } else {
   1139         TValue si = ktok_get_source_info(K);
   1140         krooted_tvs_push(K, si); /* will be popped by throw */
   1141         new_obj = ksymbol_new_bs(K, ks_tbget_buffer(K), i, si);
   1142         krooted_tvs_pop(K); /* already in symbol */
   1143     }
   1144     krooted_tvs_push(K, new_obj);
   1145     ks_tbclear(K); /* this shouldn't cause gc, but just in case */
   1146     krooted_tvs_pop(K);
   1147     return new_obj;
   1148 }
   1149 
   1150 TValue ktok_read_bar_identifier(klisp_State *K, bool keywordp)
   1151 {
   1152     /* discard opening bar */
   1153     ktok_getc(K);
   1154 
   1155     bool done = false;
   1156     int i = 0;
   1157 
   1158     /* Never downcase chars in |...| escaped symbols */
   1159     while(!done) {
   1160         int ch = ktok_getc(K);
   1161         if (ch == EOF) {
   1162             ktok_error(K, keywordp? 
   1163                        "EOF found while reading a #:|keyword|" :
   1164                        "EOF found while reading an |identifier|");
   1165             return KINERT; /* avoid warning */
   1166         } else 	if (ch < 0 || ch > 127) {
   1167             ktok_error(K, keywordp? 
   1168                        "Non ASCII char found while reading a #:|keyword|" :
   1169                        "Non ASCII char found while reading an |identifier|");
   1170             return KINERT; /* avoid warning */
   1171         }
   1172 
   1173         if (ch == '|') {
   1174             ks_tbadd(K, '\0');
   1175             done = true;
   1176         } else if (ch == '\\') {
   1177             ch = ktok_getc(K);
   1178 	
   1179             if (ch == EOF) {
   1180                 ktok_error(K, keywordp? 
   1181                            "EOF found while reading a #:|keyword|" :
   1182                            "EOF found while reading an |identifier|");
   1183                 return KINERT; /* avoid warning */
   1184             }
   1185 
   1186             switch(ch) {
   1187                 /* These two will self insert */
   1188             case '|':
   1189             case '\\':
   1190                 break;
   1191             case 'x':
   1192                 ch = ktok_read_hex_escape(K);
   1193                 break;
   1194             default:
   1195                 ktok_error_extra(K,  keywordp? 
   1196                                  "Invalid char after '\\' while reading a "
   1197                                  "#:|keyword|" :
   1198                                  "Invalid char after '\\' while reading an "
   1199                                  "|identifier|", ch2tv(ch));
   1200                 return KINERT; /* avoid warning */
   1201             }
   1202             ks_tbadd(K, ch);
   1203             ++i;
   1204         } else { 
   1205             ks_tbadd(K, ch);
   1206             ++i;
   1207         }
   1208     }
   1209     TValue new_obj;
   1210     if (keywordp) {
   1211         new_obj = kkeyword_new_bs(K, ks_tbget_buffer(K), i);
   1212     } else {
   1213         TValue si = ktok_get_source_info(K);
   1214         krooted_tvs_push(K, si); /* will be popped by throw */
   1215         new_obj = ksymbol_new_bs(K, ks_tbget_buffer(K), i, si);
   1216         krooted_tvs_pop(K); /* already in symbol */
   1217     }
   1218     krooted_tvs_push(K, new_obj);
   1219     ks_tbclear(K); /* this shouldn't cause gc, but just in case */
   1220     krooted_tvs_pop(K);
   1221     return new_obj;
   1222 }
   1223
	klisp an open source interpreter for the Kernel Programming Language.
	git clone http://git.hanabi.in/repos/klisp.git
	Log \| Files \| Refs \| README