klisp

an open source interpreter for the Kernel Programming Language.
git clone http://git.hanabi.in/repos/klisp.git
Log | Files | Refs | README

commit 207af60949a35641f556f5644702c2187f9e8bce
parent 9a95a9f6f843044e72cee78d5429d5b0c4481588
Author: Andres Navarro <canavarro82@gmail.com>
Date:   Wed, 30 Nov 2011 07:21:34 -0300

Added reader/writer support for symbol escapes (both hex escapes and |...| escaped symbols. Bugfix: only #\x works in hex escaped chars (not #\X). Added '.' as a possible initial character for symbols This follows scheme. Some light refactoring.

Diffstat:
MTODO | 3+--
Msrc/Makefile | 4++--
Msrc/kghelpers.h | 10+++++-----
Msrc/kgsymbols.c | 2+-
Msrc/klisp.c | 4++--
Msrc/kobject.h | 5-----
Msrc/kstate.c | 4++--
Msrc/ksymbol.c | 69++++++++++++++-------------------------------------------------------
Msrc/ksymbol.h | 14++++++++------
Msrc/ktoken.c | 305++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------
Msrc/ktoken.h | 7++++---
Msrc/kwrite.c | 105++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
12 files changed, 328 insertions(+), 204 deletions(-)

diff --git a/TODO b/TODO @@ -34,8 +34,7 @@ * applicatives: ** number->string (r7rs) ** string->number (r7rs) -* reader -** symbol escapes (r7rs) +* reader/writer ** syntax support for complex numbers (Kernel report) * other ** optional argument to member? (r7rs) diff --git a/src/Makefile b/src/Makefile @@ -280,7 +280,7 @@ kstate.o: kstate.c klisp.h kobject.h klimits.h klispconf.h kstate.h \ kstring.o: kstring.c kstring.h kobject.h klimits.h klisp.h klispconf.h \ kstate.h ktoken.h kmem.h kgc.h ksymbol.o: ksymbol.c ksymbol.h kobject.h klimits.h klisp.h klispconf.h \ - kstate.h ktoken.h kmem.h kstring.h kgc.h + kstate.h kmem.h kstring.h kgc.h ksystem.o: ksystem.c kobject.h klimits.h klisp.h klispconf.h kstate.h \ ktoken.h kmem.h kerror.h kpair.h kgc.h ksystem.h ksystem.posix.o: ksystem.posix.c kobject.h klimits.h klisp.h klispconf.h \ @@ -298,7 +298,7 @@ kvector.o: kvector.c kvector.h kobject.h klimits.h klisp.h klispconf.h \ kwrite.o: kwrite.c kwrite.h kobject.h klimits.h klisp.h klispconf.h \ kstate.h ktoken.h kmem.h kinteger.h imath.h krational.h imrat.h kreal.h \ kpair.h kgc.h kstring.h ksymbol.h kerror.h ktable.h kport.h \ - kenvironment.h kbytevector.h kvector.h + kenvironment.h kbytevector.h kvector.h ktoken.h imath.o: imath.c imath.h kobject.h klimits.h klisp.h klispconf.h kstate.h \ ktoken.h kmem.h kerror.h kpair.h kgc.h imrat.o: imrat.c imrat.h imath.h kobject.h klimits.h klisp.h klispconf.h \ diff --git a/src/kghelpers.h b/src/kghelpers.h @@ -515,7 +515,7 @@ TValue map_for_each_transpose(klisp_State *K, TValue lss, /* TODO add si to the symbols */ #if KTRACK_SI #define add_operative(K_, env_, n_, fn_, ...) \ - { symbol = ksymbol_new(K_, n_, KNIL); \ + { symbol = ksymbol_new_b(K_, n_, KNIL); \ value = kmake_operative(K_, fn_, __VA_ARGS__); \ TValue str = kstring_new_b_imm(K_, __FILE__); \ TValue si = kcons(K, str, kcons(K_, i2tv(__LINE__), \ @@ -524,7 +524,7 @@ TValue map_for_each_transpose(klisp_State *K, TValue lss, kadd_binding(K_, env_, symbol, value); } #define add_applicative(K_, env_, n_, fn_, ...) \ - { symbol = ksymbol_new(K_, n_, KNIL); \ + { symbol = ksymbol_new_b(K_, n_, KNIL); \ value = kmake_applicative(K_, fn_, __VA_ARGS__); \ TValue str = kstring_new_b_imm(K_, __FILE__); \ TValue si = kcons(K, str, kcons(K_, i2tv(__LINE__), \ @@ -534,19 +534,19 @@ TValue map_for_each_transpose(klisp_State *K, TValue lss, kadd_binding(K_, env_, symbol, value); } #else /* KTRACK_SI */ #define add_operative(K_, env_, n_, fn_, ...) \ - { symbol = ksymbol_new(K_, n_, KNIL); \ + { symbol = ksymbol_new_b(K_, n_, KNIL); \ value = kmake_operative(K_, fn_, __VA_ARGS__); \ kadd_binding(K_, env_, symbol, value); } #define add_applicative(K_, env_, n_, fn_, ...) \ - { symbol = ksymbol_new(K_, n_); \ + { symbol = ksymbol_new_b(K_, n_, KNIL); \ value = kmake_applicative(K_, fn_, __VA_ARGS__); \ kadd_binding(K_, env_, symbol, value); } #endif /* KTRACK_SI */ #define add_value(K_, env_, n_, v_) \ { value = v_; \ - symbol = ksymbol_new(K_, n_, KNIL); \ + symbol = ksymbol_new_b(K_, n_, KNIL); \ kadd_binding(K_, env_, symbol, v_); } #endif diff --git a/src/kgsymbols.c b/src/kgsymbols.c @@ -61,7 +61,7 @@ void string_to_symbol(klisp_State *K) UNUSED(denv); bind_1tp(K, ptree, "string", ttisstring, str); /* TODO si */ - TValue new_sym = ksymbol_new_check_i(K, str, KNIL); + TValue new_sym = ksymbol_new_str(K, str, KNIL); kapply_cc(K, new_sym); } diff --git a/src/klisp.c b/src/klisp.c @@ -548,7 +548,7 @@ static void populate_argument_lists(klisp_State *K, char **argv, int argc, tail = kimm_cons(K, obj, tail); } /* Store the script argument list */ - obj = ksymbol_new(K, "get-script-arguments", KNIL); + obj = ksymbol_new_b(K, "get-script-arguments", KNIL); klisp_assert(kbinds(K, K->ground_env, obj)); obj = kunwrap(kget_binding(K, K->ground_env, obj)); tv2op(obj)->extra[0] = tail; @@ -559,7 +559,7 @@ static void populate_argument_lists(klisp_State *K, char **argv, int argc, tail = kimm_cons(K, obj, tail); } /* Store the interpreter argument list */ - obj = ksymbol_new(K, "get-interpreter-arguments", KNIL); + obj = ksymbol_new_b(K, "get-interpreter-arguments", KNIL); klisp_assert(kbinds(K, K->ground_env, obj)); obj = kunwrap(kget_binding(K, K->ground_env, obj)); tv2op(obj)->extra[0] = tail; diff --git a/src/kobject.h b/src/kobject.h @@ -822,11 +822,6 @@ int32_t kmark_count; #define kis_mutable(o_) ((tv_get_kflags(o_) & K_FLAG_IMMUTABLE) == 0) #define kis_immutable(o_) (!kis_mutable(o_)) -/* KFlags for symbols */ -/* has external representation (identifiers) */ -#define K_FLAG_EXT_REP 0x01 -#define khas_ext_rep(s_) ((tv_get_kflags(s_) & K_FLAG_EXT_REP) != 0) - /* KFlags for marking continuations */ #define K_FLAG_OUTER 0x01 #define K_FLAG_INNER 0x02 diff --git a/src/kstate.c b/src/kstate.c @@ -223,7 +223,7 @@ klisp_State *klisp_newstate (klisp_Alloc f, void *ud) { kset_source_info(K, K->eval_op, si); /* TODO: si */ - TValue eval_name = ksymbol_new(K, "eval", KNIL); + TValue eval_name = ksymbol_new_b(K, "eval", KNIL); ktry_set_name(K, K->eval_op, eval_name); K->list_app = kmake_applicative(K, list, 0), line_number = __LINE__; @@ -238,7 +238,7 @@ klisp_State *klisp_newstate (klisp_Alloc f, void *ud) { /* MAYBE: fix it so we can remove module_params_sym from roots */ /* TODO si */ - K->module_params_sym = ksymbol_new(K, "module-parameters", KNIL); + K->module_params_sym = ksymbol_new_b(K, "module-parameters", KNIL); /* Create the root and error continuation (will be added to the environment in kinit_ground_env) */ diff --git a/src/ksymbol.c b/src/ksymbol.c @@ -8,8 +8,6 @@ #include "ksymbol.h" #include "kobject.h" -/* for identifier checking */ -#include "ktoken.h" #include "kstate.h" #include "kmem.h" #include "kgc.h" @@ -19,10 +17,13 @@ /* NOTE: symbols can have source info, they should be compared with tv_sym_equal, NOT tv_equal */ -/* TEMP: for now only interned symbols are the ones that don't - have source info (like those created with string->symbol) */ -TValue ksymbol_new_g(klisp_State *K, const char *buf, int32_t size, - TValue si, bool identifierp) +/* No case folding is performed by these constructors */ + +/* +** Interned symbols are only the ones that don't have source info +** (like those created with string->symbol) +*/ +TValue ksymbol_new_bs(klisp_State *K, const char *buf, int32_t size, TValue si) { /* First calculate the hash */ uint32_t h = size; /* seed */ @@ -71,7 +72,7 @@ TValue ksymbol_new_g(klisp_State *K, const char *buf, int32_t size, differently */ new_sym->gct = klispC_white(K); new_sym->tt = K_TSYMBOL; - new_sym->kflags = identifierp? K_FLAG_EXT_REP : 0; + new_sym->kflags = 0; new_sym->si = NULL; /* symbol specific fields */ @@ -93,8 +94,7 @@ TValue ksymbol_new_g(klisp_State *K, const char *buf, int32_t size, } else { /* non nil source info */ /* link it with regular objects and save source info */ /* header + gc_fields */ - klispC_link(K, (GCObject *) new_sym, K_TSYMBOL, - identifierp? K_FLAG_EXT_REP : 0); + klispC_link(K, (GCObject *) new_sym, K_TSYMBOL, 0); /* symbol specific fields */ new_sym->str = new_str; @@ -107,59 +107,18 @@ TValue ksymbol_new_g(klisp_State *K, const char *buf, int32_t size, return ret_tv; } -/* for indentifiers */ -TValue ksymbol_new_i(klisp_State *K, const char *buf, int32_t size, TValue si) -{ - return ksymbol_new_g(K, buf, size, si, true); -} - -/* for indentifiers with no size */ -TValue ksymbol_new(klisp_State *K, const char *buf, TValue si) +/* for c strings with unknown size */ +TValue ksymbol_new_b(klisp_State *K, const char *buf, TValue si) { int32_t size = (int32_t) strlen(buf); - return ksymbol_new_g(K, buf, size, si, true); + return ksymbol_new_bs(K, buf, size, si); } /* for string->symbol */ /* GC: assumes str is rooted */ -TValue ksymbol_new_check_i(klisp_State *K, TValue str, TValue si) +TValue ksymbol_new_str(klisp_State *K, TValue str, TValue si) { - int32_t size = kstring_size(str); - char *buf = kstring_buf(str); - bool identifierp; - - /* this is necessary because the empty symbol isn't an identifier */ - /* MAYBE it should throw an error if the string is empty */ - /* XXX: The exact syntax for identifiers isn't there in the report - yet, here we use something like scheme, and the same as in ktoken.h - (details, leading numbers '.', '+' and '-' are a no go, but '+' and - '-' are an exception. - */ - identifierp = (size > 0); - if (identifierp) { - char first = *buf; - buf++; - size--; - if (first == '+' || first == '-') - identifierp = (size == 0); - else if (first == '.' || ktok_is_numeric(first)) - identifierp = false; - else - identifierp = ktok_is_subsequent(first); - - while(identifierp && size--) { - if (!ktok_is_subsequent(*buf)) - identifierp = false; - else - buf++; - } - } - /* recover size & buf*/ - size = kstring_size(str); - buf = kstring_buf(str); - - TValue new_sym = ksymbol_new_g(K, buf, size, si, identifierp); - return new_sym; + return ksymbol_new_bs(K, kstring_buf(str), kstring_size(str), si); } bool ksymbolp(TValue obj) { return ttissymbol(obj); } diff --git a/src/ksymbol.h b/src/ksymbol.h @@ -17,13 +17,15 @@ /* NOTE: symbols can have source info, they should be compared with tv_sym_equal, NOT tv_equal */ -/* For identifiers */ -TValue ksymbol_new_i(klisp_State *K, const char *buf, int32_t size, +/* No case folding is performed by these constructors */ + +/* buffer + size, may contain nulls */ +TValue ksymbol_new_bs(klisp_State *K, const char *buf, int32_t size, TValue si); -/* For identifiers, simplified for unknown size */ -TValue ksymbol_new(klisp_State *K, const char *buf, TValue si); -/* For general strings, copies str if not immutable */ -TValue ksymbol_new_check_i(klisp_State *K, TValue str, TValue si); +/* null terminated buffer */ +TValue ksymbol_new_b(klisp_State *K, const char *buf, TValue si); +/* copies str if not immutable */ +TValue ksymbol_new_str(klisp_State *K, TValue str, TValue si); #define ksymbol_str(tv_) (tv2sym(tv_)->str) #define ksymbol_buf(tv_) (kstring_buf(tv2sym(tv_)->str)) diff --git a/src/ktoken.c b/src/ktoken.c @@ -5,23 +5,10 @@ */ /* -** Symbols should be converted to some standard case before interning -** (in this case downcase) -*/ - -/* ** TODO: ** -** From the Report: -** -** - Support for complete number syntax (complex) -** -** NOT from the Report: +** - Support for complete number syntax (complex) (report) ** - Support for unicode (strings, char and symbols). -** - srfi-30 stype #| ... |# nested comments and srfi-62 style #; -** sexp comments. -** - more named chars (like #\tab and in strings "\t") -** - numeric escaped chars (like #\u0020) ** */ #include <stdio.h> @@ -86,7 +73,8 @@ void kcharset_union(kcharset chs, kcharset chs2) ** Character sets for classification */ kcharset ktok_alphabetic, ktok_numeric, ktok_whitespace; -kcharset ktok_delimiter, ktok_extended, ktok_subsequent; +kcharset ktok_delimiter, ktok_extended; +kcharset ktok_initial, ktok_subsequent; /* ** Special Tokens @@ -101,7 +89,9 @@ kcharset ktok_delimiter, ktok_extended, ktok_subsequent; ** char in the car and nil in the cdr. ** srfi-38 tokens are also represented with a char in the car indicating if ** it's a defining token ('=') or a referring token ('#') and the number in -** the cdr. This way a special token can be easily tested for (with ttispair) +** the cdr. +** The sexp comment token with a ';' in the car. +** This way a special token can be easily tested for (with ttispair) ** and easily classified (with switch(chvalue(kcar(tok)))). ** */ @@ -118,10 +108,18 @@ void ktok_init(klisp_State *K) kcharset_fill(ktok_delimiter, "()\";"); kcharset_union(ktok_delimiter, ktok_whitespace); - kcharset_fill(ktok_extended, "!$%&*+-./:<=>?@^_~"); + kcharset_fill(ktok_initial, "!$%&*./:<=>?@^_~"); + kcharset_union(ktok_initial, ktok_alphabetic); + + /* N.B. Unlike in scheme, kernel admits both '.' and + '@' as initial chars in identifiers, but doesn't allow + '+' or '-'. There are 3 exceptions: + both '+' and '-' alone are identifiers and '.' alone is + not an identifier */ + kcharset_fill(ktok_extended, "+-"); kcharset_empty(ktok_subsequent); - kcharset_union(ktok_subsequent, ktok_alphabetic); + kcharset_union(ktok_subsequent, ktok_initial); kcharset_union(ktok_subsequent, ktok_numeric); kcharset_union(ktok_subsequent, ktok_extended); } @@ -314,13 +312,15 @@ void ktok_ignore_whitespace(klisp_State *K); void ktok_ignore_single_line_comment(klisp_State *K); void ktok_ignore_multi_line_comment(klisp_State *K); bool ktok_check_delimiter(klisp_State *K); +char ktok_read_hex_escape(klisp_State *K); TValue ktok_read_string(klisp_State *K); TValue ktok_read_special(klisp_State *K); TValue ktok_read_number(klisp_State *K, char *buf, int32_t len, bool has_exactp, bool exactp, bool has_radixp, int32_t radix); TValue ktok_read_maybe_signed_numeric(klisp_State *K); -TValue ktok_read_identifier(klisp_State *K); +TValue ktok_read_identifier_or_dot(klisp_State *K); +TValue ktok_read_bar_identifier(klisp_State *K); int ktok_read_until_delimiter(klisp_State *K); /* @@ -357,17 +357,10 @@ TValue ktok_read_token(klisp_State *K) case ')': ktok_getc(K); return K->ktok_rparen; - case '.': - ktok_getc(K); - if (ktok_check_delimiter(K)) - return K->ktok_dot; - else { - ktok_error(K, "no delimiter found after dot"); - /* avoid warning */ - return KINERT; - } case '"': return ktok_read_string(K); + case '|': + return ktok_read_bar_identifier(K); /* TODO use read_until_delimiter in all these cases */ case '#': { ktok_getc(K); @@ -405,6 +398,8 @@ TValue ktok_read_token(klisp_State *K) case '+': case '-': /* signed number, no exactness or radix indicator */ return ktok_read_maybe_signed_numeric(K); + case '\\': /* this is a symbol that starts with an hex escape */ + /* These should be kept synchronized with initial */ case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': @@ -416,27 +411,14 @@ TValue ktok_read_token(klisp_State *K) case '!': case '$': case '%': case '&': case '*': case '/': case ':': case '<': case '=': case '>': case '?': case '@': case '^': case '_': case '~': + case '.': /* this is either a symbol or a dot token */ /* - ** NOTE: the cases for '+', '-', '.' and numbers were already - ** considered so identifier-subsequent is used instead of - ** identifier-first-char (in the cases above) + ** N.B.: the cases for '+', and '-', were already + ** considered */ - return ktok_read_identifier(K); - case '|': - ktok_getc(K); - chi = ktok_peekc(K); - if (chi == EOF || chi != '#') { - chi = '|'; - goto unrecognized_error; - } - ktok_getc(K); - ktok_error(K, "unmatched multiline comment close (\"|#\")"); - /* avoid warning */ - return KINERT; + return ktok_read_identifier_or_dot(K); default: chi = ktok_getc(K); - /* TODO add char to error */ - unrecognized_error: ktok_error_extra(K, "unrecognized token starting char", ch2tv((char) chi)); /* avoid warning */ @@ -610,7 +592,7 @@ TValue ktok_read_maybe_signed_numeric(klisp_State *K) /* save the source info in the symbol */ TValue si = ktok_get_source_info(K); krooted_tvs_push(K, si); /* will be popped by throw */ - TValue new_sym = ksymbol_new_i(K, ks_tbget_buffer(K), 1, si); + TValue new_sym = ksymbol_new_bs(K, ks_tbget_buffer(K), 1, si); krooted_tvs_pop(K); /* already in symbol */ krooted_tvs_push(K, new_sym); ks_tbclear(K); /* this shouldn't cause gc, but just in case */ @@ -626,6 +608,62 @@ TValue ktok_read_maybe_signed_numeric(klisp_State *K) } /* +** Hex escapes for strings and symbols +** "#\xXXXXXX;" +** "#\x" already read +*/ + +char ktok_read_hex_escape(klisp_State *K) +{ + /* enough space for any unicode char + 2 */ + int ch; + char buf[10]; + int c = 0; + bool at_least_onep = false; + for(ch = ktok_getc(K); ch != EOF && ch != ';'; + ch = ktok_getc(K)) { + if (!ktok_is_digit(ch, 16)) { + ktok_error_extra(K, "Invalid char found in hex escape", + ch2tv(ch)); + return '\0'; /* avoid warning */ + } + /* + ** This will allow one space for '\0' and one extra + ** char in case the value is too big, and so will + ** naturally result in a value outside the unicode + ** range without the need to record any extra + ** characters other than the first 8 (without + ** leading zeroes). + */ + at_least_onep = true; + if (c < sizeof(buf) - 1 && (c > 0 || ch != '0')) + buf[c++] = ch; + } + if (ch == EOF) { + ktok_error(K, "EOF found while reading hex escape"); + return '\0'; /* avoid warning */ + } else if (!at_least_onep) { + ktok_error(K, "Empty hex escape found"); + return '\0'; /* avoid warning */ + } else if (c == 0) { /* this is the case of a NULL char */ + buf[c++] = '0'; + } + buf[c++] = '\0'; + /* buf now contains the hex value of the char */ + TValue n; + int res = kinteger_read(K, buf, 16, &n, NULL); + /* can't fail, all digits were checked already */ + klisp_assert(res == true); + if (!ttisfixint(n) || ivalue(n) > 127) { + krooted_tvs_push(K, n); + ktok_error_extra(K, "hex escaped char out of ASCII range", n); + return '\0'; /* avoid warning */ + } + /* all ok, we pass the char */ + return (char) ivalue(n); +} + +/* ** Strings */ TValue ktok_read_string(klisp_State *K) @@ -723,55 +761,9 @@ TValue ktok_read_string(klisp_State *K) string at the top of the loop */ goto just_read; /* This is an hex escaped char */ - case 'x': { - /* enough space for any unicode char + 2 */ - char buf[10]; - int c = 0; - bool at_least_onep = false; - for(ch = ktok_getc(K); ch != EOF && ch != ';'; - ch = ktok_getc(K)) { - if (!ktok_is_digit(ch, 16)) { - ktok_error_extra(K, "Invalid char found in string " - "hex escape", ch2tv(ch)); - return KINERT; /* avoid warning */ - } - /* - ** This will allow one space for '\0' and one extra - ** char in case the value is too big, and so will - ** naturally result in a value outside the unicode - ** range without the need to record any extra - ** characters other than the first 8 (without - ** leading zeroes). - */ - at_least_onep = true; - if (c < sizeof(buf) - 1 && (c > 0 || ch != '0')) - buf[c++] = ch; - } - if (ch == EOF) { - ktok_error(K, "EOF found while reading a string"); - return KINERT; /* avoid warning */ - } else if (!at_least_onep) { - ktok_error(K, "Empty hex escape found while reading " - "a string"); - return KINERT; /* avoid warning */ - } else if (c == 0) { /* this is the case of a NULL char */ - buf[c++] = '0'; - } - buf[c++] = '\0'; - /* buf now contains the hex value of the char */ - TValue n; - int res = kinteger_read(K, buf, 16, &n, NULL); - /* can't fail, all digits were checked already */ - klisp_assert(res == true); - if (!ttisfixint(n) || ivalue(n) > 127) { - ktok_error(K, "hex escaped char out of ASCII range " - "while reading a string"); - return KINERT; /* avoid warning */ - } - /* all ok, we pass the char */ - ch = (char) ivalue(n); + case 'x': + ch = ktok_read_hex_escape(K); break; - } default: ktok_error_extra(K, "Invalid char after '\\' " "while reading a string", ch2tv(ch)); @@ -856,7 +848,8 @@ TValue ktok_read_special(klisp_State *K) } /* Then check for simple chars, this is the only thing - that is case dependant, so after this we downcase buf */ + that is case dependant, so after this we downcase buf + (except that an escaped char needs a small 'x' */ /* REFACTOR: move this to a new function */ /* char constant, needs at least 3 chars unless it's a delimiter * char! */ @@ -865,8 +858,7 @@ TValue ktok_read_special(klisp_State *K) int ch_i = ktok_getc(K); if (ch_i == EOF) { ktok_error(K, "EOF found while reading character name"); - /* avoid warning */ - return KINERT; + return KINERT; /* avoid warning */ } ks_tbclear(K); return ch2tv((char)ch_i); @@ -879,7 +871,7 @@ TValue ktok_read_special(klisp_State *K) ** Kernel report (R-1RK)) ** For now we follow the scheme report */ - char ch = buf[2]; + char ch = buf[2]; /* we know buf_len > 2 */ if (ch < 0 || ch > 127) { ktok_error(K, "Non ASCII char found as character constant"); @@ -897,15 +889,20 @@ TValue ktok_read_special(klisp_State *K) /* fall through */ } - /* we ignore case in all remaining comparisons */ - for(char *str2 = buf; *str2 != '\0'; str2++) + /* first save the third char, in case it's an hex escaped char + (that should be a lowercase x) */ + char saved_third = buf[2]; /* there's at least 2 chars, so in the worst + case buf[2] is just '\0' */ + + /* now, we ignore case in all remaining comparisons */ + size_t i = 0; + for(char *str2 = buf; i < buf_len; ++str2, ++i) *str2 = tolower(*str2); /* REFACTOR: move this to a new function */ /* then check the known constants (including named characters) */ size_t stok_size = sizeof(kspecial_tokens) / sizeof(struct kspecial_token); - size_t i; for (i = 0; i < stok_size; i++) { struct kspecial_token token = kspecial_tokens[i]; /* NOTE: must check type because buf may contain embedded '\0's */ @@ -920,12 +917,13 @@ TValue ktok_read_special(klisp_State *K) token or a character escape */ if (buf[1] == '\\') { /* this is to have a meaningful error msg */ - if (buf[2] != 'x') {/* this will also accept 'X' */ + if (saved_third != 'x') { /* case is significant here, so + we use the saved char */ ktok_error(K, "Unrecognized character name"); return KINERT; } - /* We already checked that length != 3, so there's at least on - more char */ + /* We already checked that length != 3 (x is alphabetic), + so there's at least on more char */ TValue n; char *end; @@ -1057,26 +1055,50 @@ TValue ktok_read_special(klisp_State *K) } /* -** Identifiers +** Identifiers (and dot token) */ -TValue ktok_read_identifier(klisp_State *K) +TValue ktok_read_identifier_or_dot(klisp_State *K) { - int32_t i = 1; + bool seen_dot = false; + int32_t i = 0; while (!ktok_check_delimiter(K)) { /* NOTE: can't be eof, because eof is a delimiter */ char ch = (char) ktok_getc(K); - + /* this is needed to differentiate a dot from an equivalent escape */ + seen_dot |= ch == '.'; /* NOTE: is_subsequent of '\0' is false, so no embedded '\0' */ if (ktok_is_subsequent(ch)) { + /* downcase all non-escaped chars */ + ks_tbadd(K, tolower(ch)); + ++i; + } else if (ch == '\\') { + /* should be inline hex escape */ + ch = ktok_getc(K); + if (ch == EOF) { + ktok_error(K, "EOF found while reading character name"); + } else if (ch != 'x') { + ktok_error_extra(K, "Invalid char in identifier after \\", + ch2tv((char)ch)); + } + ch = ktok_read_hex_escape(K); + /* don't downcase escaped chars */ ks_tbadd(K, ch); - i++; - } else - ktok_error(K, "Invalid char in identifier"); + ++i; + } else { + ktok_error_extra(K, "Invalid char in identifier", + ch2tv((char)ch)); + } + } + + if (i == 1 && seen_dot) { + ks_tbclear(K); + return K->ktok_dot; } + ks_tbadd(K, '\0'); TValue si = ktok_get_source_info(K); krooted_tvs_push(K, si); /* will be popped by throw */ - TValue new_sym = ksymbol_new_i(K, ks_tbget_buffer(K), i-1, si); + TValue new_sym = ksymbol_new_bs(K, ks_tbget_buffer(K), i, si); krooted_tvs_pop(K); /* already in symbol */ krooted_tvs_push(K, new_sym); ks_tbclear(K); /* this shouldn't cause gc, but just in case */ @@ -1084,4 +1106,63 @@ TValue ktok_read_identifier(klisp_State *K) return new_sym; } +TValue ktok_read_bar_identifier(klisp_State *K) +{ + /* discard opening bar */ + ktok_getc(K); + + bool done = false; + int i = 0; + + /* Never downcase chars in |...| escaped symbols */ + while(!done) { + int ch = ktok_getc(K); + if (ch == EOF) { + ktok_error(K, "EOF found while reading an |identifier|"); + return KINERT; /* avoid warning */ + } else if (ch < 0 || ch > 127) { + ktok_error(K, "Non ASCII char found while reading an identifier"); + return KINERT; /* avoid warning */ + } + + if (ch == '|') { + ks_tbadd(K, '\0'); + done = true; + } else if (ch == '\\') { + ch = ktok_getc(K); + + if (ch == EOF) { + ktok_error(K, "EOF found while reading an |identifier|"); + return KINERT; /* avoid warning */ + } + + switch(ch) { + /* These two will self insert */ + case '|': + case '\\': + break; + case 'x': + ch = ktok_read_hex_escape(K); + break; + default: + ktok_error_extra(K, "Invalid char after '\\' " + "while reading a symbol", ch2tv(ch)); + return KINERT; /* avoid warning */ + } + ks_tbadd(K, ch); + ++i; + } else { + ks_tbadd(K, ch); + ++i; + } + } + TValue si = ktok_get_source_info(K); + krooted_tvs_push(K, si); /* will be popped by throw */ + TValue new_sym = ksymbol_new_bs(K, ks_tbget_buffer(K), i, si); + krooted_tvs_pop(K); /* already in symbol */ + krooted_tvs_push(K, new_sym); + ks_tbclear(K); /* this shouldn't cause gc, but just in case */ + krooted_tvs_pop(K); + return new_sym; +} diff --git a/src/ktoken.h b/src/ktoken.h @@ -35,10 +35,9 @@ inline int ktok_peekc(klisp_State *K) { return ktok_peekc_getc(K, true); } /* needed by the repl */ void ktok_ignore_whitespace(klisp_State *K); -/* This is needed for string->symbol to check if a symbol has external +/* This is needed for kwrite to check if a symbol has external representation as an identifier */ /* REFACTOR: think out a better interface to all this */ - /* ** Char set contains macro interface */ @@ -49,7 +48,8 @@ void ktok_ignore_whitespace(klisp_State *K); typedef uint32_t kcharset[8]; extern kcharset ktok_alphabetic, ktok_numeric, ktok_whitespace; -extern kcharset ktok_delimiter, ktok_extended, ktok_subsequent; +extern kcharset ktok_delimiter, ktok_extended; +extern kcharset ktok_subsequent, ktok_initial; #define ktok_is_alphabetic(chi_) kcharset_contains(ktok_alphabetic, chi_) #define ktok_is_numeric(chi_) kcharset_contains(ktok_numeric, chi_) @@ -57,6 +57,7 @@ extern kcharset ktok_delimiter, ktok_extended, ktok_subsequent; #define ktok_is_whitespace(chi_) kcharset_contains(ktok_whitespace, chi_) #define ktok_is_delimiter(chi_) ((chi_) == EOF || \ kcharset_contains(ktok_delimiter, chi_)) +#define ktok_is_initial(chi_) kcharset_contains(ktok_initial, chi_) #define ktok_is_subsequent(chi_) kcharset_contains(ktok_subsequent, chi_) #define kcharset_contains(kch_, ch_) \ diff --git a/src/kwrite.c b/src/kwrite.c @@ -10,6 +10,7 @@ #include <assert.h> #include <inttypes.h> #include <string.h> +#include <ctype.h> #include "kwrite.h" #include "kobject.h" @@ -26,6 +27,7 @@ #include "kenvironment.h" #include "kbytevector.h" #include "kvector.h" +#include "ktoken.h" /* for identifier checking */ /* ** Stack for the write FSM @@ -194,8 +196,8 @@ void kw_print_string(klisp_State *K, TValue str) (!K->write_displayp && (*ptr == '\\' || *ptr == '"'))); ++i, ptr++) { - /* This are all ASCII printable characters + space, except \ and - " if !displayp */ + /* This are all ASCII printable characters (including space, + and exceptuating '\' and '"' if !displayp) */ char *fmt; /* must be uint32_t to support all unicode chars in the future */ @@ -235,6 +237,95 @@ void kw_print_string(klisp_State *K, TValue str) } /* +** Helper for printing symbols. +** If symbol is not a regular identifier it +** uses the "|...|" syntax, escaping '|', '\' and +** non printing characters. +*/ +void kw_print_symbol(klisp_State *K, TValue sym) +{ + uint32_t size = ksymbol_size(sym); + char *buf = ksymbol_buf(sym); + + /* first determine if it's a simple identifier */ + bool identifierp; + if (size == 0) + identifierp = false; + else if (size == 1 && *buf == '.') + identifierp = false; + else if (size == 1 && (*buf == '+' || *buf == '-')) + identifierp = true; + else if (*buf == tolower(*buf) && ktok_is_initial(*buf)) { + char *ptr = buf; + uint32_t i = 0; + identifierp = true; + while (identifierp && i < size) { + char ch = *ptr++; + ++i; + if (tolower(ch) != ch || !ktok_is_subsequent(ch)) + identifierp = false; + } + } else + identifierp = false; + + if (identifierp) { + /* no problem, just a simple string */ + kw_printf(K, "%s", buf); + return; + } + + /* + ** In case we get here, we'll have to use the "|...|" syntax + */ + char *ptr = buf; + int i = 0; + + kw_printf(K, "|"); + + while (i < size) { + /* find the longest printf-able substring to avoid calling printf + for every char */ + for (ptr = buf; + i < size && *ptr != '\0' && + (*ptr >= 32 && *ptr < 127) && + (*ptr != '\\' && *ptr != '|'); + i++, ptr++) + ; + + /* NOTE: this work even if ptr == buf (which can only happen the + first or last time) */ + char ch = *ptr; + *ptr = '\0'; + kw_printf(K, "%s", buf); + *ptr = ch; + + for(; i < size && (*ptr == '\0' || (*ptr < 32 || *ptr >= 127) || + (*ptr == '\\' || *ptr == '|')); + ++i, ptr++) { + /* This are all ASCII printable characters (including space, + and exceptuating '\' and '|') */ + char *fmt; + /* must be uint32_t to support all unicode chars + in the future */ + uint32_t arg; + ch = *ptr; + switch(*ptr) { + /* regular \ escapes */ + case '|': fmt = "\\%c"; arg = (uint32_t) '|'; break; + case '\\': fmt = "\\%c"; arg = (uint32_t) '\\'; break; + /* for the rest of the non printable chars, + use hex escape */ + default: fmt = "\\x%x;"; arg = (uint32_t) ch; break; + } + kw_printf(K, fmt, arg); + } + buf = ptr; + } + + kw_printf(K, "|"); +} + +/* ** Mark initialization and clearing */ /* GC: root is rooted */ @@ -305,7 +396,8 @@ void kw_set_initial_marks(klisp_State *K, TValue root) #if KTRACK_NAMES void kw_print_name(klisp_State *K, TValue obj) { - kw_printf(K, ": %s", ksymbol_buf(kget_name(K, obj))); + kw_printf(K, ": "); + kw_print_symbol(K, kget_name(K, obj)); } #endif /* KTRACK_NAMES */ @@ -471,12 +563,7 @@ void kwrite_scalar(klisp_State *K, TValue obj) kw_printf(K, "#%c", bvalue(obj)? 't' : 'f'); break; case K_TSYMBOL: - if (khas_ext_rep(obj)) { - /* TEMP: access symbol structure directly */ - kw_printf(K, "%s", ksymbol_buf(obj)); - } else { - kw_printf(K, "#[symbol]"); - } + kw_print_symbol(K, obj); break; case K_TINERT: kw_printf(K, "#inert");