ktoken.c (40482B)
1 /* 2 ** ktoken.c 3 ** Tokenizer for the Kernel Programming Language 4 ** See Copyright Notice in klisp.h 5 */ 6 7 /* 8 ** TODO: 9 ** 10 ** - Support for complete number syntax (complex) (report) 11 ** - Support for unicode (strings, char and symbols). 12 ** 13 */ 14 #include <stdio.h> 15 #include <stdlib.h> 16 #include <string.h> 17 #include <ctype.h> 18 #include <stdint.h> 19 #include <stdbool.h> 20 21 #include "ktoken.h" 22 #include "kobject.h" 23 #include "kstate.h" 24 #include "kinteger.h" 25 #include "krational.h" 26 #include "kreal.h" 27 #include "kpair.h" 28 #include "kstring.h" 29 #include "kbytevector.h" 30 #include "ksymbol.h" 31 #include "kkeyword.h" 32 #include "kerror.h" 33 #include "kport.h" 34 35 /* 36 ** Char sets for fast ASCII char classification 37 */ 38 39 /* 40 ** Char set function/macro interface 41 */ 42 void kcharset_empty(kcharset); 43 void kcharset_fill(kcharset, char *); 44 void kcharset_union(kcharset, kcharset); 45 /* contains in .h */ 46 47 void kcharset_empty(kcharset chs) 48 { 49 for (int i = 0; i < 8; i++) { 50 chs[i] = 0; 51 } 52 } 53 54 void kcharset_fill(kcharset chs, char *chars_) 55 { 56 unsigned char *chars = (unsigned char *) chars_; 57 unsigned char ch; 58 59 kcharset_empty(chs); 60 61 while ((ch = *chars++)) { 62 chs[KCHS_OCTANT(ch)] |= KCHS_BIT(ch); 63 } 64 } 65 66 void kcharset_union(kcharset chs, kcharset chs2) 67 { 68 for (int i = 0; i < 8; i++) { 69 chs[i] |= chs2[i]; 70 } 71 } 72 73 /* 74 ** Character sets for classification 75 */ 76 kcharset ktok_alphabetic, ktok_numeric, ktok_whitespace; 77 kcharset ktok_delimiter, ktok_extended; 78 kcharset ktok_initial, ktok_subsequent; 79 80 /* 81 ** Special Tokens 82 ** 83 ** TEMP: defined in kstate.h 84 ** 85 ** RATIONALE: 86 ** 87 ** Because a pair is not a token, they can be used to represent special tokens 88 ** instead of creating an otherwise useless special token type 89 ** lparen, rparen and dot are represented as a pair with the corresponding 90 ** char in the car and nil in the cdr. 91 ** srfi-38 tokens are also represented with a char in the car indicating if 92 ** it's a defining token ('=') or a referring token ('#') and the number in 93 ** the cdr. 94 ** The sexp comment token with a ';' in the car. 95 ** This way a special token can be easily tested for (with ttispair) 96 ** and easily classified (with switch(chvalue(kcar(tok)))). 97 ** 98 */ 99 100 void ktok_init(klisp_State *K) 101 { 102 /* Character sets */ 103 kcharset_fill(ktok_alphabetic, "ABCDEFGHIJKLMNOPQRSTUVWXYZ" 104 "abcdefghijklmnopqrstuvwxyz"); 105 kcharset_fill(ktok_numeric, "0123456789"); 106 /* keep synchronized with cases in main tokenizer switch */ 107 kcharset_fill(ktok_whitespace, " \t\v\r\n\f"); 108 109 kcharset_fill(ktok_delimiter, "()\";"); 110 kcharset_union(ktok_delimiter, ktok_whitespace); 111 112 kcharset_fill(ktok_initial, "!$%&*./:<=>?@^_~"); 113 kcharset_union(ktok_initial, ktok_alphabetic); 114 115 /* N.B. Unlike in scheme, kernel admits both '.' and 116 '@' as initial chars in identifiers, but doesn't allow 117 '+' or '-'. There are 3 exceptions: 118 both '+' and '-' alone are identifiers and '.' alone is 119 not an identifier */ 120 kcharset_fill(ktok_extended, "+-"); 121 122 kcharset_empty(ktok_subsequent); 123 kcharset_union(ktok_subsequent, ktok_initial); 124 kcharset_union(ktok_subsequent, ktok_numeric); 125 kcharset_union(ktok_subsequent, ktok_extended); 126 } 127 128 /* 129 ** Error management 130 */ 131 132 void clear_shared_dict(klisp_State *K) 133 { 134 K->shared_dict = KNIL; 135 } 136 137 #define ktok_error(K, str) ktok_error_g(K, str, false, KINERT) 138 #define ktok_error_extra(K, str, extra) ktok_error_g(K, str, true, extra) 139 140 void ktok_error_g(klisp_State *K, char *str, bool extra, TValue extra_value) 141 { 142 /* all cleaning is done in throw 143 (stacks, shared_dict, rooted objs) */ 144 145 /* save the last source code info on the port */ 146 kport_update_source_info(K->curr_port, K->ktok_source_info.line, 147 K->ktok_source_info.col); 148 149 /* include the source info (and extra value if present) in the error */ 150 TValue irritants; 151 if (extra) { 152 krooted_tvs_push(K, extra_value); /* will be popped by throw */ 153 TValue si = ktok_get_source_info(K); 154 krooted_tvs_push(K, si); /* will be popped by throw */ 155 irritants = klist_g(K, false, 2, si, extra_value); 156 } else { 157 irritants = ktok_get_source_info(K); 158 } 159 krooted_tvs_push(K, irritants); /* will be popped by throw */ 160 klispE_throw_with_irritants(K, str, irritants); 161 } 162 163 /* 164 ** Underlying stream interface & source code location tracking 165 */ 166 167 /* TODO/OPTIMIZE We should use buffering to shorten the 168 average code path to read each char */ 169 /* this reads one character from curr_port */ 170 int ktok_ggetc(klisp_State *K) 171 { 172 /* XXX when full unicode is used (uint32_t) a different way should 173 be use to signal EOF */ 174 175 TValue port = K->curr_port; 176 if (ttisfport(port)) { 177 /* fport */ 178 FILE *file = kfport_file(port); 179 180 /* LOCK: only a single lock should be acquired */ 181 klisp_unlock(K); 182 int chi = getc(file); 183 klisp_lock(K); 184 185 if (chi == EOF) { 186 /* NOTE: eof doesn't change source code location info */ 187 if (ferror(file) != 0) { 188 /* clear error marker to allow retries later */ 189 clearerr(file); 190 /* TODO put error info on the error obj */ 191 ktok_error(K, "reading error"); 192 return 0; 193 } else { /* if (feof(file) != 0) */ 194 /* let the eof marker set */ 195 K->ktok_seen_eof = true; 196 return EOF; 197 } 198 } else 199 return chi; 200 } else { 201 /* mport */ 202 if (kport_is_binary(port)) { 203 /* bytevector port */ 204 if (kmport_off(port) >= kbytevector_size(kmport_buf(port))) { 205 K->ktok_seen_eof = true; 206 return EOF; 207 } 208 int chi = kbytevector_buf(kmport_buf(port))[kmport_off(port)]; 209 ++kmport_off(port); 210 return chi; 211 } else { 212 /* string port */ 213 if (kmport_off(port) >= kstring_size(kmport_buf(port))) { 214 K->ktok_seen_eof = true; 215 return EOF; 216 } 217 int chi = kstring_buf(kmport_buf(port))[kmport_off(port)]; 218 ++kmport_off(port); 219 return chi; 220 } 221 } 222 } 223 224 /* this returns one character to curr_port */ 225 void ktok_gungetc(klisp_State *K, int chi) 226 { 227 if (chi == EOF) 228 return; 229 230 TValue port = K->curr_port; 231 if (ttisfport(port)) { 232 /* fport */ 233 FILE *file = kfport_file(port); 234 235 if (ungetc(chi, file) == EOF) { 236 if (ferror(file) != 0) { 237 /* clear error marker to allow retries later */ 238 clearerr(file); 239 } 240 /* TODO put error info on the error obj */ 241 ktok_error(K, "reading error"); 242 return; 243 } 244 } else { 245 /* mport */ 246 if (kport_is_binary(port)) { 247 /* bytevector port */ 248 --kmport_off(port); 249 } else { 250 /* string port */ 251 --kmport_off(port); 252 } 253 } 254 } 255 256 int ktok_peekc_getc(klisp_State *K, bool peekp) 257 { 258 /* WORKAROUND: for stdin line buffering & reading of EOF, this flag 259 is reset on every read */ 260 /* Otherwise, at least in linux, after reading or peeking an EOF from the 261 console, the next char isn't eof anymore */ 262 if (K->ktok_seen_eof) 263 return EOF; 264 265 int chi = ktok_ggetc(K); 266 267 if (peekp) { 268 ktok_gungetc(K, chi); 269 return chi; 270 } 271 272 /* track source code location before returning the char */ 273 if (chi == '\t') { 274 /* align column to next tab stop */ 275 K->ktok_source_info.col = 276 (K->ktok_source_info.col + K->ktok_source_info.tab_width) - 277 (K->ktok_source_info.col % K->ktok_source_info.tab_width); 278 } else if (chi == '\n') { 279 K->ktok_source_info.line++; 280 K->ktok_source_info.col = 0; 281 } else { 282 K->ktok_source_info.col++; 283 } 284 return chi; 285 } 286 287 void ktok_save_source_info(klisp_State *K) 288 { 289 K->ktok_source_info.saved_line = K->ktok_source_info.line; 290 K->ktok_source_info.saved_col = K->ktok_source_info.col; 291 } 292 293 TValue ktok_get_source_info(klisp_State *K) 294 { 295 /* TEMP: for now, lines and column names are fixints */ 296 TValue pos = kcons(K, i2tv(K->ktok_source_info.saved_line), 297 i2tv(K->ktok_source_info.saved_col)); 298 krooted_tvs_push(K, pos); 299 /* the filename is rooted in the port */ 300 TValue res = kcons(K, K->ktok_source_info.filename, pos); 301 krooted_tvs_pop(K); 302 return res; 303 } 304 305 void ktok_set_source_info(klisp_State *K, TValue filename, int32_t line, 306 int32_t col) 307 { 308 K->ktok_source_info.filename = filename; 309 K->ktok_source_info.line = line; 310 K->ktok_source_info.col = col; 311 } 312 313 314 /* 315 ** ktok_read_token() helpers 316 */ 317 void ktok_ignore_whitespace(klisp_State *K); 318 void ktok_ignore_single_line_comment(klisp_State *K); 319 void ktok_ignore_multi_line_comment(klisp_State *K); 320 bool ktok_check_delimiter(klisp_State *K); 321 char ktok_read_hex_escape(klisp_State *K); 322 TValue ktok_read_string(klisp_State *K); 323 TValue ktok_read_special(klisp_State *K); 324 TValue ktok_read_number(klisp_State *K, char *buf, int32_t len, 325 bool has_exactp, bool exactp, bool has_radixp, 326 int32_t radix); 327 TValue ktok_read_maybe_signed_numeric(klisp_State *K); 328 TValue ktok_read_identifier_or_dot(klisp_State *K, bool keywordp); 329 TValue ktok_read_bar_identifier(klisp_State *K, bool keywordp); 330 int ktok_read_until_delimiter(klisp_State *K); 331 332 /* 333 ** Main tokenizer function 334 */ 335 TValue ktok_read_token(klisp_State *K) 336 { 337 klisp_assert(ks_tbisempty(K)); 338 339 while(true) { 340 /* save the source info in case a token starts here */ 341 ktok_save_source_info(K); 342 343 int chi = ktok_peekc(K); 344 345 switch(chi) { 346 case EOF: 347 ktok_getc(K); 348 return KEOF; 349 case ' ': 350 case '\n': 351 case '\r': 352 case '\t': 353 case '\v': 354 case '\f': /* Keep synchronized with whitespace chars */ 355 ktok_ignore_whitespace(K); 356 continue; 357 case ';': 358 ktok_ignore_single_line_comment(K); 359 continue; 360 case '(': 361 ktok_getc(K); 362 return G(K)->ktok_lparen; 363 case ')': 364 ktok_getc(K); 365 return G(K)->ktok_rparen; 366 case '"': 367 return ktok_read_string(K); 368 case '|': 369 return ktok_read_bar_identifier(K, false); 370 /* TODO use read_until_delimiter in all these cases */ 371 case '#': { 372 ktok_getc(K); 373 chi = ktok_peekc(K); 374 switch(chi) { 375 case EOF: 376 ktok_error(K, "# constant is too short"); 377 return KINERT; /* avoid warning */ 378 case '!': /* single line comment (alternative syntax) */ 379 /* this handles the #! style script header too! */ 380 ktok_ignore_single_line_comment(K); 381 continue; 382 case '|': /* nested/multiline comment */ 383 ktok_getc(K); /* discard the '|' */ 384 klisp_assert(K->ktok_nested_comments == 0); 385 K->ktok_nested_comments = 1; 386 ktok_ignore_multi_line_comment(K); 387 continue; 388 case ';': /* sexp comment */ 389 ktok_getc(K); /* discard the ';' */ 390 return G(K)->ktok_sexp_comment; 391 case ':': /* keyword */ 392 ktok_getc(K); /* discard the ':' */ 393 chi = ktok_peekc(K); 394 if (chi == EOF) { 395 ktok_error(K, "# constant is too short"); 396 return KINERT; /* avoid warning */ 397 } else if (chi == '|') { 398 return ktok_read_bar_identifier(K, true); 399 } else if (chi == '\\' || ktok_is_initial(chi)) { 400 return ktok_read_identifier_or_dot(K, true); 401 } else if (chi == '+' || chi == '-') { 402 char ch = (char) chi; 403 ktok_getc(K); /* discard the '+' or '-' */ 404 if (ktok_check_delimiter(K)) { 405 return kkeyword_new_bs(K, &ch, 1); 406 } else { 407 ktok_error_extra(K, "invalid start in keyword", 408 ch2tv(ch)); 409 return KINERT; /* avoid warning */ 410 } 411 } else { 412 ktok_error_extra(K, "invalid char starting keyword", 413 ch2tv((char) chi)); 414 return KINERT; /* avoid warning */ 415 } 416 default: 417 return ktok_read_special(K); 418 } 419 } 420 case '0': case '1': case '2': case '3': case '4': 421 case '5': case '6': case '7': case '8': case '9': { 422 /* positive number, no exactness or radix indicator */ 423 int32_t buf_len = ktok_read_until_delimiter(K); 424 char *buf = ks_tbget_buffer(K); 425 /* read number should free the tbbuffer */ 426 return ktok_read_number(K, buf, buf_len, false, false, false, 10); 427 } 428 case '+': case '-': 429 /* signed number, no exactness or radix indicator */ 430 return ktok_read_maybe_signed_numeric(K); 431 case '\\': /* this is a symbol that starts with an hex escape */ 432 /* These should be kept synchronized with initial */ 433 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': 434 case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': 435 case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': 436 case 'V': case 'W': case 'X': case 'Y': case 'Z': 437 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': 438 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': 439 case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': 440 case 'v': case 'w': case 'x': case 'y': case 'z': 441 case '!': case '$': case '%': case '&': case '*': case '/': case ':': 442 case '<': case '=': case '>': case '?': case '@': case '^': case '_': 443 case '~': 444 case '.': /* this is either a symbol or a dot token */ 445 /* 446 ** N.B.: the cases for '+', and '-', were already 447 ** considered 448 */ 449 return ktok_read_identifier_or_dot(K, false); 450 default: 451 chi = ktok_getc(K); 452 ktok_error_extra(K, "unrecognized token starting char", 453 ch2tv((char) chi)); 454 /* avoid warning */ 455 return KINERT; 456 } 457 } 458 } 459 460 /* 461 ** Comments and Whitespace 462 */ 463 void ktok_ignore_single_line_comment(klisp_State *K) 464 { 465 int chi; 466 do { 467 chi = ktok_getc(K); 468 } while (chi != EOF && chi != '\n'); 469 } 470 471 void ktok_ignore_multi_line_comment(klisp_State *K) 472 { 473 /* the first "#|' was already read */ 474 klisp_assert(K->ktok_nested_comments == 1); 475 int chi; 476 TValue last_nested_comment_si = ktok_get_source_info(K); 477 krooted_vars_push(K, &last_nested_comment_si); 478 ks_spush(K, KNIL); 479 480 while(K->ktok_nested_comments > 0) { 481 chi = ktok_peekc(K); 482 while (chi != EOF && chi != '|' && chi != '#') { 483 UNUSED(ktok_getc(K)); 484 chi = ktok_peekc(K); 485 } 486 if (chi == EOF) 487 goto eof_error; 488 489 char first_char = (char) chi; 490 491 /* this first char will actually be the same just peeked, that's no 492 problem, it will save the source info the first time around the 493 loop */ 494 chi = ktok_peekc(K); 495 while (chi != EOF && chi == first_char) { 496 ktok_save_source_info(K); 497 UNUSED(ktok_getc(K)); 498 chi = ktok_peekc(K); 499 } 500 if (chi == EOF) 501 goto eof_error; 502 503 UNUSED(ktok_getc(K)); 504 505 if (chi == '#') { 506 /* close comment (first char was '|', so the seq is "|#") */ 507 --K->ktok_nested_comments; 508 last_nested_comment_si = ks_spop(K); 509 } else if (chi == '|') { 510 /* open comment (first char was '#', so the seq is "#|") */ 511 klisp_assert(K->ktok_nested_comments < 1000); 512 ++K->ktok_nested_comments; 513 ks_spush(K, last_nested_comment_si); 514 last_nested_comment_si = ktok_get_source_info(K); 515 } 516 /* else lone '#' or '|', just continue */ 517 } 518 krooted_vars_pop(K); 519 return; 520 eof_error: 521 K->ktok_nested_comments = 0; 522 ktok_save_source_info(K); 523 UNUSED(ktok_getc(K)); 524 krooted_vars_pop(K); 525 ktok_error_extra(K, "unterminated multi line comment", last_nested_comment_si); 526 } 527 528 void ktok_ignore_whitespace(klisp_State *K) 529 { 530 /* NOTE: if it's not whitespace do nothing (even on eof) */ 531 while(true) { 532 int chi = ktok_peekc(K); 533 534 if (chi == EOF) { 535 return; 536 } else { 537 char ch = (char) chi; 538 if (ktok_is_whitespace(ch)) { 539 ktok_getc(K); 540 } else { 541 return; 542 } 543 } 544 } 545 } 546 547 /* 548 ** Delimiter checking 549 */ 550 bool ktok_check_delimiter(klisp_State *K) 551 { 552 int chi = ktok_peekc(K); 553 return (ktok_is_delimiter(chi)); 554 } 555 556 /* 557 ** Returns the number of bytes read 558 */ 559 int32_t ktok_read_until_delimiter(klisp_State *K) 560 { 561 int i = 0; 562 563 while (!ktok_check_delimiter(K)) { 564 /* NOTE: can't be eof, because eof is a delimiter */ 565 char ch = (char) ktok_getc(K); 566 ks_tbadd(K, ch); 567 i++; 568 } 569 ks_tbadd(K, '\0'); 570 return i; 571 } 572 573 /* 574 ** Numbers 575 ** TEMP: for now, only integers & rationals 576 ** The digits are in buf, that must be freed after use, 577 ** len should be at least one 578 */ 579 TValue ktok_read_number(klisp_State *K, char *buf, int32_t len, 580 bool has_exactp, bool exactp, bool has_radixp, 581 int32_t radix) 582 { 583 UNUSED(len); /* not needed really, buf ends with '\0' */ 584 TValue n; 585 if (radix == 10) { 586 /* only allow decimals with radix 10 */ 587 bool decimalp = false; 588 if (!krational_read_decimal(K, buf, radix, &n, NULL, &decimalp)) { 589 /* TODO throw meaningful error msgs, use last param */ 590 ktok_error(K, "Bad format in number"); 591 return KINERT; 592 } 593 if (decimalp && !has_exactp) { 594 /* handle decimal format as an explicit #i */ 595 has_exactp = true; 596 exactp = false; 597 } 598 } else { 599 if (!krational_read(K, buf, radix, &n, NULL)) { 600 /* TODO throw meaningful error msgs, use last param */ 601 ktok_error(K, "Bad format in number"); 602 return KINERT; 603 } 604 } 605 ks_tbclear(K); 606 607 if (has_exactp && !exactp) { 608 krooted_tvs_push(K, n); 609 n = kexact_to_inexact(K, n); 610 krooted_tvs_pop(K); 611 } 612 return n; 613 } 614 615 TValue ktok_read_maybe_signed_numeric(klisp_State *K) 616 { 617 /* NOTE: can't be eof, it's either '+' or '-' */ 618 char ch = (char) ktok_getc(K); 619 if (ktok_check_delimiter(K)) { 620 ks_tbadd(K, ch); 621 ks_tbadd(K, '\0'); 622 /* save the source info in the symbol */ 623 TValue si = ktok_get_source_info(K); 624 krooted_tvs_push(K, si); /* will be popped by throw */ 625 TValue new_sym = ksymbol_new_bs(K, ks_tbget_buffer(K), 1, si); 626 krooted_tvs_pop(K); /* already in symbol */ 627 krooted_tvs_push(K, new_sym); 628 ks_tbclear(K); /* this shouldn't cause gc, but just in case */ 629 krooted_tvs_pop(K); 630 return new_sym; 631 } else { 632 ks_tbadd(K, ch); 633 int32_t buf_len = ktok_read_until_delimiter(K)+1; 634 char *buf = ks_tbget_buffer(K); 635 /* no exactness or radix prefix, default radix: 10 */ 636 return ktok_read_number(K, buf, buf_len, false, false, false, 10); 637 } 638 } 639 640 /* 641 ** Hex escapes for strings and symbols 642 ** "#\xXXXXXX;" 643 ** "#\x" already read 644 */ 645 char ktok_read_hex_escape(klisp_State *K) 646 { 647 /* enough space for any unicode char + 2 */ 648 int ch; 649 char buf[10]; 650 int c = 0; 651 bool at_least_onep = false; 652 for(ch = ktok_getc(K); ch != EOF && ch != ';'; 653 ch = ktok_getc(K)) { 654 if (!ktok_is_digit(ch, 16)) { 655 ktok_error_extra(K, "Invalid char found in hex escape", 656 ch2tv(ch)); 657 return '\0'; /* avoid warning */ 658 } 659 /* 660 ** This will allow one space for '\0' and one extra 661 ** char in case the value is too big, and so will 662 ** naturally result in a value outside the unicode 663 ** range without the need to record any extra 664 ** characters other than the first 8 (without 665 ** leading zeroes). 666 */ 667 at_least_onep = true; 668 if (c < sizeof(buf) - 1 && (c > 0 || ch != '0')) 669 buf[c++] = ch; 670 } 671 if (ch == EOF) { 672 ktok_error(K, "EOF found while reading hex escape"); 673 return '\0'; /* avoid warning */ 674 } else if (!at_least_onep) { 675 ktok_error(K, "Empty hex escape found"); 676 return '\0'; /* avoid warning */ 677 } else if (c == 0) { /* this is the case of a NULL char */ 678 buf[c++] = '0'; 679 } 680 buf[c++] = '\0'; 681 /* buf now contains the hex value of the char */ 682 TValue n; 683 int res = kinteger_read(K, buf, 16, &n, NULL); 684 /* can't fail, all digits were checked already */ 685 klisp_assert(res == true); 686 if (!ttisfixint(n) || ivalue(n) > 127) { 687 krooted_tvs_push(K, n); 688 ktok_error_extra(K, "hex escaped char out of ASCII range", n); 689 return '\0'; /* avoid warning */ 690 } 691 /* all ok, we pass the char */ 692 return (char) ivalue(n); 693 } 694 695 /* 696 ** Strings 697 */ 698 TValue ktok_read_string(klisp_State *K) 699 { 700 /* discard opening quote */ 701 ktok_getc(K); 702 703 bool done = false; 704 int i = 0; 705 706 while(!done) { 707 int ch = ktok_getc(K); 708 just_read: /* this comes from escaped newline */ 709 if (ch == EOF) { 710 ktok_error(K, "EOF found while reading a string"); 711 return KINERT; /* avoid warning */ 712 } else if (ch < 0 || ch > 127) { 713 ktok_error(K, "Non ASCII char found while reading a string"); 714 return KINERT; /* avoid warning */ 715 } 716 717 718 if (ch == '"') { 719 ks_tbadd(K, '\0'); 720 done = true; 721 } else if (ch == '\\') { 722 ch = ktok_getc(K); 723 724 if (ch == EOF) { 725 ktok_error(K, "EOF found while reading a string"); 726 return KINERT; /* avoid warning */ 727 } 728 729 switch(ch) { 730 /* These two will self insert */ 731 case '"': 732 case '\\': 733 break; 734 /* These are naming chars (like in c, mostly) */ 735 case '0': 736 ch = '\0'; 737 break; 738 case 'a': 739 ch = '\a'; 740 break; 741 case 'b': 742 ch = '\b'; 743 break; 744 case 't': 745 ch = '\t'; 746 break; 747 case 'n': 748 ch = '\n'; 749 break; 750 case 'r': 751 ch = '\r'; 752 break; 753 case 'v': 754 ch = '\v'; 755 break; 756 case 'f': 757 ch = '\f'; 758 break; 759 /* 760 ** These signal an escaped newline (not included in string) 761 */ 762 case ' ': 763 case '\t': 764 /* eat up all intraline spacing */ 765 while((ch = ktok_getc(K)) != EOF && 766 (ch == ' ' || ch == '\t')) 767 ; 768 if (ch == EOF) { 769 ktok_error(K, "EOF found while reading a string"); 770 return KINERT; /* avoid warning */ 771 } else if (ch != '\n' && ch != '\r') { 772 ktok_error(K, "Invalid char found after \\ while " 773 "reading a string"); 774 return KINERT; /* avoid warning */ 775 } 776 /* fall through */ 777 case '\n': 778 case '\r': 779 /* use the r6rs definition for line end */ 780 if (ch == 'r') { 781 ch = ktok_peekc(K); 782 if (ch != EOF && ch == '\n') 783 ktok_getc(K); 784 } 785 /* eat up all intraline spacing */ 786 while((ch = ktok_getc(K)) != EOF && 787 (ch == ' ' || ch == '\t')) 788 ; 789 /* this will check for EOF and continue reading the 790 string at the top of the loop */ 791 goto just_read; 792 /* This is an hex escaped char */ 793 case 'x': 794 ch = ktok_read_hex_escape(K); 795 break; 796 default: 797 ktok_error_extra(K, "Invalid char after '\\' " 798 "while reading a string", ch2tv(ch)); 799 return KINERT; /* avoid warning */ 800 } 801 ks_tbadd(K, ch); 802 ++i; 803 } else { 804 ks_tbadd(K, ch); 805 ++i; 806 } 807 } 808 /* TEMP: for now strings "read" are mutable but strings "loaded" are 809 not */ 810 TValue new_str = kstring_new_bs_g(K, K->read_mconsp, 811 ks_tbget_buffer(K), i); 812 krooted_tvs_push(K, new_str); 813 ks_tbclear(K); /* shouldn't cause gc, but still */ 814 krooted_tvs_pop(K); 815 return new_str; 816 } 817 818 /* 819 ** Special constants (starting with "#") 820 ** (Special number syntax, char constants, #ignore, #inert, srfi-38 tokens) 821 */ 822 823 /* this include the named chars as a subcase */ 824 struct kspecial_token { 825 const char *ext_rep; /* downcase external representation */ 826 TValue obj; 827 } kspecial_tokens[] = { { "#t", KTRUE_ }, 828 { "#f", KFALSE_ }, 829 { "#ignore", KIGNORE_ }, 830 { "#inert", KINERT_ }, 831 { "#e+infinity", KEPINF_ }, 832 { "#e-infinity", KEMINF_ }, 833 { "#i+infinity", KIPINF_ }, 834 { "#i-infinity", KIMINF_ }, 835 { "#real", KRWNPV_ }, 836 { "#undefined", KUNDEF_ }, 837 /* 838 ** Character names 839 ** (r7rs + vtab from r6rs) 840 */ 841 { "#\\null", KNULL_ }, 842 { "#\\alarm", KALARM_ }, 843 { "#\\backspace", KBACKSPACE_ }, 844 { "#\\tab", KTAB_ }, 845 { "#\\newline", KNEWLINE_ }, /* kernel */ 846 { "#\\return", KRETURN_ }, 847 { "#\\escape", KESCAPE_ }, 848 { "#\\space", KSPACE_ }, /* kernel */ 849 { "#\\delete", KDELETE_ }, 850 { "#\\vtab", KVTAB_ }, /* r6rs, only */ 851 { "#\\formfeed", KFORMFEED_ } /* r6rs in strings */ 852 }; 853 854 #define MAX_EXT_REP_SIZE 64 /* all special tokens have less than 64 chars */ 855 856 TValue ktok_read_special(klisp_State *K) 857 { 858 /* the # is already consumed, add it manually */ 859 ks_tbadd(K, '#'); 860 int32_t buf_len = ktok_read_until_delimiter(K) + 1; 861 char *buf = ks_tbget_buffer(K); 862 863 if (buf_len < 2) { 864 /* we need at least one char in addition to the '#' */ 865 ktok_error(K, "# constant is too short"); 866 /* avoid warning */ 867 return KINERT; 868 } 869 870 /* first check that is not an output only representation, 871 they begin with '#[' and end with ']', but we know 872 that buf[0] == '#' */ 873 if (buf_len > 2 && buf[1] == '[' && buf[buf_len-1] == ']') { 874 ktok_error(K, "output only representation found"); 875 /* avoid warning */ 876 return KINERT; 877 } 878 879 /* Then check for simple chars, this is the only thing 880 that is case dependant, so after this we downcase buf 881 (except that an escaped char needs a small 'x' */ 882 /* REFACTOR: move this to a new function */ 883 /* char constant, needs at least 3 chars unless it's a delimiter 884 * char! */ 885 if (buf_len == 2 && buf[1] == '\\') { 886 /* was a delimiter char... read it */ 887 int ch_i = ktok_getc(K); 888 if (ch_i == EOF) { 889 ktok_error(K, "EOF found while reading character name"); 890 return KINERT; /* avoid warning */ 891 } 892 ks_tbclear(K); 893 return ch2tv((char)ch_i); 894 } else if (buf[1] == '\\') { 895 /* 896 ** RATIONALE: in the scheme spec (R5RS) it says that only alphabetic 897 ** char constants need a delimiter to disambiguate the cases with 898 ** character names. It would be more consistent if all characters 899 ** needed a delimiter (and is probably implied by the yet incomplete 900 ** Kernel report (R-1RK)) 901 ** For now we follow the scheme report 902 */ 903 char ch = buf[2]; /* we know buf_len > 2 */ 904 905 if (ch < 0 || ch > 127) { 906 ktok_error(K, "Non ASCII char found as character constant"); 907 /* avoid warning */ 908 return KINERT; 909 } 910 911 if (!ktok_is_alphabetic(ch) || buf_len == 3) { /* simple char */ 912 ks_tbclear(K); 913 return ch2tv(ch); 914 } 915 916 /* char names are a subcase of special tokens so this case 917 will be handled later */ 918 /* fall through */ 919 } 920 921 /* first save the third char, in case it's an hex escaped char 922 (that should be a lowercase x) */ 923 char saved_third = buf[2]; /* there's at least 2 chars, so in the worst 924 case buf[2] is just '\0' */ 925 926 /* now, we ignore case in all remaining comparisons */ 927 size_t i = 0; 928 for(char *str2 = buf; i < buf_len; ++str2, ++i) 929 *str2 = tolower(*str2); 930 931 /* REFACTOR: move this to a new function */ 932 /* then check the known constants (including named characters) */ 933 size_t stok_size = sizeof(kspecial_tokens) / 934 sizeof(struct kspecial_token); 935 for (i = 0; i < stok_size; i++) { 936 struct kspecial_token token = kspecial_tokens[i]; 937 /* NOTE: must check type because buf may contain embedded '\0's */ 938 if (buf_len == strlen(token.ext_rep) && 939 strcmp(token.ext_rep, buf) == 0) { 940 ks_tbclear(K); 941 return token.obj; 942 } 943 } 944 945 /* It wasn't a special token or named char, but it can still be a srfi-38 946 token or a character escape */ 947 948 if (buf[1] == '\\') { /* this is to have a meaningful error msg */ 949 if (saved_third != 'x') { /* case is significant here, so 950 we use the saved char */ 951 ktok_error(K, "Unrecognized character name"); 952 return KINERT; 953 } 954 /* We already checked that length != 3 (x is alphabetic), 955 so there's at least on more char */ 956 TValue n; 957 char *end; 958 959 /* test for - and + explicitly, becayse kinteger read would parse them 960 without complaining (it will also parse spaces, but we read until 961 delimiter so... */ 962 if (buf[3] == '-' || buf[3] == '+' || 963 !kinteger_read(K, buf+3, 16, &n, &end) || 964 end - buf != buf_len) { 965 ktok_error(K, "Bad char in hex escaped character constant"); 966 return KINERT; 967 } else if (!ttisfixint(n) || ivalue(n) > 127) { 968 ktok_error(K, "Non ASCII char found in hex escaped character constant"); 969 /* avoid warning */ 970 return KINERT; 971 } else { 972 /* all ok, we just clean up and return the char */ 973 ks_tbclear(K); 974 return ch2tv(ivalue(n)); 975 } 976 } 977 978 /* REFACTOR: move this to a new function */ 979 /* It was not a special token so it must be either a srfi-38 style 980 token, or a number. srfi-38 tokens are a '#' a 981 decimal number and end with a '=' or a '#' */ 982 if (buf_len > 2 && ktok_is_numeric(buf[1])) { 983 /* NOTE: it's important to check is_numeric to avoid problems with 984 sign in kinteger_read */ 985 /* srfi-38 type token (can be either a def or ref) */ 986 /* TODO: lift this implementation restriction */ 987 /* IMPLEMENTATION RESTRICTION: only allow fixints in shared tokens */ 988 char ch = buf[buf_len-1]; /* remember last char */ 989 buf[buf_len-1] = '\0'; /* replace last char with 0 to read number */ 990 991 if (ch != '#' && ch != '=') { 992 ktok_error(K, "Missing last char in srfi-38 token"); 993 return KINERT; 994 } /* else buf[i] == '#' or '=' */ 995 TValue n; 996 char *end; 997 /* 10 is the radix for srfi-38 tokens, buf+1 to jump over the '#', 998 end+1 to count the last char */ 999 /* N.B. buf+1 can't be + or -, we already tested numeric before */ 1000 if (!kinteger_read(K, buf+1, 10, &n, &end) || end+1 - buf != buf_len) { 1001 ktok_error(K, "Bad char in srfi-38 token"); 1002 return KINERT; 1003 } else if (!ttisfixint(n)) { 1004 ktok_error(K, "IMP. RESTRICTION: shared token too big"); 1005 /* avoid warning */ 1006 return KINERT; 1007 } 1008 ks_tbclear(K); 1009 /* GC: no need to root n, for now it's a fixint */ 1010 return kcons(K, ch2tv(ch), n); 1011 } 1012 1013 /* REFACTOR: move to new function */ 1014 1015 /* the only possibility left is that it is a number with 1016 an exactness or radix refix */ 1017 bool has_exactp = false; 1018 bool exactp = false; /* the default exactness will depend on the format */ 1019 bool has_radixp = false; 1020 int32_t radix = 10; 1021 1022 int32_t idx = 1; 1023 while (idx < buf_len) { 1024 char ch = buf[idx]; 1025 switch(ch) { 1026 case 'i': 1027 case 'e': 1028 if (has_exactp) { 1029 ktok_error(K, "two exactness prefixes in number"); 1030 return KINERT; 1031 } 1032 has_exactp = true; 1033 exactp = (ch == 'e'); 1034 break; 1035 case 'b': radix = 2; goto RADIX; 1036 case 'o': radix = 8; goto RADIX; 1037 case 'd': radix = 10; goto RADIX; 1038 case 'x': radix = 16; goto RADIX; 1039 RADIX: 1040 if (has_radixp) { 1041 ktok_error(K, "two radix prefixes in number"); 1042 return KINERT; 1043 } 1044 has_radixp = true; 1045 break; 1046 default: 1047 ktok_error(K, "unknown # constant or " 1048 "unexpected char in number after #"); 1049 /* avoid warning */ 1050 return KINERT; 1051 } 1052 ++idx; 1053 if (idx == buf_len) 1054 break; 1055 ch = buf[idx]; 1056 1057 switch(ch) { 1058 case '#': { 1059 ++idx; /* get next exacness or radix prefix */ 1060 break; 1061 } 1062 case '0': case '1': case '2': case '3': case '4': 1063 case '5': case '6': case '7': case '8': case '9': 1064 case 'a': case 'b': case 'c': case 'd': case 'e': 1065 case 'f': case '+': case '-': { /* read the number */ 1066 if (idx == buf_len) { 1067 ktok_error(K, "no digits found in number"); 1068 } else { 1069 return ktok_read_number(K, buf+idx, buf_len - idx, 1070 has_exactp, exactp, 1071 has_radixp, radix); 1072 } 1073 } 1074 default: 1075 ktok_error(K, "unexpected char in number"); 1076 /* avoid warning */ 1077 return KINERT; 1078 } 1079 } 1080 /* this means that the number wasn't found after the prefixes */ 1081 ktok_error(K, "no digits found in number"); 1082 /* avoid warning */ 1083 return KINERT; 1084 } 1085 1086 /* 1087 ** Identifiers & Keywords (and dot token) 1088 */ 1089 TValue ktok_read_identifier_or_dot(klisp_State *K, bool keywordp) 1090 { 1091 bool seen_dot = false; 1092 int32_t i = 0; 1093 while (!ktok_check_delimiter(K)) { 1094 /* NOTE: can't be eof, because eof is a delimiter */ 1095 char ch = (char) ktok_getc(K); 1096 /* this is needed to differentiate a dot from an equivalent escape */ 1097 seen_dot |= ch == '.'; 1098 /* NOTE: is_subsequent of '\0' is false, so no embedded '\0' */ 1099 if (ktok_is_subsequent(ch)) { 1100 /* downcase all non-escaped chars */ 1101 ks_tbadd(K, tolower(ch)); 1102 ++i; 1103 } else if (ch == '\\') { 1104 /* should be inline hex escape */ 1105 ch = ktok_getc(K); 1106 if (ch == EOF) { 1107 ktok_error(K, "EOF found while reading character escape"); 1108 } else if (ch != 'x') { 1109 ktok_error_extra(K, keywordp? 1110 "Invalid char after \\ in keyword" : 1111 "Invalid char after \\ in identifier", 1112 ch2tv((char)ch)); 1113 } 1114 ch = ktok_read_hex_escape(K); 1115 /* don't downcase escaped chars */ 1116 ks_tbadd(K, ch); 1117 ++i; 1118 } else { 1119 ktok_error_extra(K, keywordp? "Invalid char in keyword" : 1120 "Invalid char in identifier", ch2tv((char)ch)); 1121 } 1122 } 1123 1124 if (i == 1 && seen_dot) { 1125 if (keywordp) { 1126 ktok_error(K, "Invalid syntax in keyword"); 1127 return KINERT; /* avoid warning */ 1128 } else { 1129 ks_tbclear(K); 1130 return G(K)->ktok_dot; 1131 } 1132 } 1133 1134 ks_tbadd(K, '\0'); 1135 TValue new_obj; 1136 if (keywordp) { 1137 new_obj = kkeyword_new_bs(K, ks_tbget_buffer(K), i); 1138 } else { 1139 TValue si = ktok_get_source_info(K); 1140 krooted_tvs_push(K, si); /* will be popped by throw */ 1141 new_obj = ksymbol_new_bs(K, ks_tbget_buffer(K), i, si); 1142 krooted_tvs_pop(K); /* already in symbol */ 1143 } 1144 krooted_tvs_push(K, new_obj); 1145 ks_tbclear(K); /* this shouldn't cause gc, but just in case */ 1146 krooted_tvs_pop(K); 1147 return new_obj; 1148 } 1149 1150 TValue ktok_read_bar_identifier(klisp_State *K, bool keywordp) 1151 { 1152 /* discard opening bar */ 1153 ktok_getc(K); 1154 1155 bool done = false; 1156 int i = 0; 1157 1158 /* Never downcase chars in |...| escaped symbols */ 1159 while(!done) { 1160 int ch = ktok_getc(K); 1161 if (ch == EOF) { 1162 ktok_error(K, keywordp? 1163 "EOF found while reading a #:|keyword|" : 1164 "EOF found while reading an |identifier|"); 1165 return KINERT; /* avoid warning */ 1166 } else if (ch < 0 || ch > 127) { 1167 ktok_error(K, keywordp? 1168 "Non ASCII char found while reading a #:|keyword|" : 1169 "Non ASCII char found while reading an |identifier|"); 1170 return KINERT; /* avoid warning */ 1171 } 1172 1173 if (ch == '|') { 1174 ks_tbadd(K, '\0'); 1175 done = true; 1176 } else if (ch == '\\') { 1177 ch = ktok_getc(K); 1178 1179 if (ch == EOF) { 1180 ktok_error(K, keywordp? 1181 "EOF found while reading a #:|keyword|" : 1182 "EOF found while reading an |identifier|"); 1183 return KINERT; /* avoid warning */ 1184 } 1185 1186 switch(ch) { 1187 /* These two will self insert */ 1188 case '|': 1189 case '\\': 1190 break; 1191 case 'x': 1192 ch = ktok_read_hex_escape(K); 1193 break; 1194 default: 1195 ktok_error_extra(K, keywordp? 1196 "Invalid char after '\\' while reading a " 1197 "#:|keyword|" : 1198 "Invalid char after '\\' while reading an " 1199 "|identifier|", ch2tv(ch)); 1200 return KINERT; /* avoid warning */ 1201 } 1202 ks_tbadd(K, ch); 1203 ++i; 1204 } else { 1205 ks_tbadd(K, ch); 1206 ++i; 1207 } 1208 } 1209 TValue new_obj; 1210 if (keywordp) { 1211 new_obj = kkeyword_new_bs(K, ks_tbget_buffer(K), i); 1212 } else { 1213 TValue si = ktok_get_source_info(K); 1214 krooted_tvs_push(K, si); /* will be popped by throw */ 1215 new_obj = ksymbol_new_bs(K, ks_tbget_buffer(K), i, si); 1216 krooted_tvs_pop(K); /* already in symbol */ 1217 } 1218 krooted_tvs_push(K, new_obj); 1219 ks_tbclear(K); /* this shouldn't cause gc, but just in case */ 1220 krooted_tvs_pop(K); 1221 return new_obj; 1222 } 1223