commit 207af60949a35641f556f5644702c2187f9e8bce
parent 9a95a9f6f843044e72cee78d5429d5b0c4481588
Author: Andres Navarro <canavarro82@gmail.com>
Date: Wed, 30 Nov 2011 07:21:34 -0300
Added reader/writer support for symbol escapes (both hex escapes and |...| escaped symbols. Bugfix: only #\x works in hex escaped chars (not #\X). Added '.' as a possible initial character for symbols This follows scheme. Some light refactoring.
Diffstat:
12 files changed, 328 insertions(+), 204 deletions(-)
diff --git a/TODO b/TODO
@@ -34,8 +34,7 @@
* applicatives:
** number->string (r7rs)
** string->number (r7rs)
-* reader
-** symbol escapes (r7rs)
+* reader/writer
** syntax support for complex numbers (Kernel report)
* other
** optional argument to member? (r7rs)
diff --git a/src/Makefile b/src/Makefile
@@ -280,7 +280,7 @@ kstate.o: kstate.c klisp.h kobject.h klimits.h klispconf.h kstate.h \
kstring.o: kstring.c kstring.h kobject.h klimits.h klisp.h klispconf.h \
kstate.h ktoken.h kmem.h kgc.h
ksymbol.o: ksymbol.c ksymbol.h kobject.h klimits.h klisp.h klispconf.h \
- kstate.h ktoken.h kmem.h kstring.h kgc.h
+ kstate.h kmem.h kstring.h kgc.h
ksystem.o: ksystem.c kobject.h klimits.h klisp.h klispconf.h kstate.h \
ktoken.h kmem.h kerror.h kpair.h kgc.h ksystem.h
ksystem.posix.o: ksystem.posix.c kobject.h klimits.h klisp.h klispconf.h \
@@ -298,7 +298,7 @@ kvector.o: kvector.c kvector.h kobject.h klimits.h klisp.h klispconf.h \
kwrite.o: kwrite.c kwrite.h kobject.h klimits.h klisp.h klispconf.h \
kstate.h ktoken.h kmem.h kinteger.h imath.h krational.h imrat.h kreal.h \
kpair.h kgc.h kstring.h ksymbol.h kerror.h ktable.h kport.h \
- kenvironment.h kbytevector.h kvector.h
+ kenvironment.h kbytevector.h kvector.h ktoken.h
imath.o: imath.c imath.h kobject.h klimits.h klisp.h klispconf.h kstate.h \
ktoken.h kmem.h kerror.h kpair.h kgc.h
imrat.o: imrat.c imrat.h imath.h kobject.h klimits.h klisp.h klispconf.h \
diff --git a/src/kghelpers.h b/src/kghelpers.h
@@ -515,7 +515,7 @@ TValue map_for_each_transpose(klisp_State *K, TValue lss,
/* TODO add si to the symbols */
#if KTRACK_SI
#define add_operative(K_, env_, n_, fn_, ...) \
- { symbol = ksymbol_new(K_, n_, KNIL); \
+ { symbol = ksymbol_new_b(K_, n_, KNIL); \
value = kmake_operative(K_, fn_, __VA_ARGS__); \
TValue str = kstring_new_b_imm(K_, __FILE__); \
TValue si = kcons(K, str, kcons(K_, i2tv(__LINE__), \
@@ -524,7 +524,7 @@ TValue map_for_each_transpose(klisp_State *K, TValue lss,
kadd_binding(K_, env_, symbol, value); }
#define add_applicative(K_, env_, n_, fn_, ...) \
- { symbol = ksymbol_new(K_, n_, KNIL); \
+ { symbol = ksymbol_new_b(K_, n_, KNIL); \
value = kmake_applicative(K_, fn_, __VA_ARGS__); \
TValue str = kstring_new_b_imm(K_, __FILE__); \
TValue si = kcons(K, str, kcons(K_, i2tv(__LINE__), \
@@ -534,19 +534,19 @@ TValue map_for_each_transpose(klisp_State *K, TValue lss,
kadd_binding(K_, env_, symbol, value); }
#else /* KTRACK_SI */
#define add_operative(K_, env_, n_, fn_, ...) \
- { symbol = ksymbol_new(K_, n_, KNIL); \
+ { symbol = ksymbol_new_b(K_, n_, KNIL); \
value = kmake_operative(K_, fn_, __VA_ARGS__); \
kadd_binding(K_, env_, symbol, value); }
#define add_applicative(K_, env_, n_, fn_, ...) \
- { symbol = ksymbol_new(K_, n_); \
+ { symbol = ksymbol_new_b(K_, n_, KNIL); \
value = kmake_applicative(K_, fn_, __VA_ARGS__); \
kadd_binding(K_, env_, symbol, value); }
#endif /* KTRACK_SI */
#define add_value(K_, env_, n_, v_) \
{ value = v_; \
- symbol = ksymbol_new(K_, n_, KNIL); \
+ symbol = ksymbol_new_b(K_, n_, KNIL); \
kadd_binding(K_, env_, symbol, v_); }
#endif
diff --git a/src/kgsymbols.c b/src/kgsymbols.c
@@ -61,7 +61,7 @@ void string_to_symbol(klisp_State *K)
UNUSED(denv);
bind_1tp(K, ptree, "string", ttisstring, str);
/* TODO si */
- TValue new_sym = ksymbol_new_check_i(K, str, KNIL);
+ TValue new_sym = ksymbol_new_str(K, str, KNIL);
kapply_cc(K, new_sym);
}
diff --git a/src/klisp.c b/src/klisp.c
@@ -548,7 +548,7 @@ static void populate_argument_lists(klisp_State *K, char **argv, int argc,
tail = kimm_cons(K, obj, tail);
}
/* Store the script argument list */
- obj = ksymbol_new(K, "get-script-arguments", KNIL);
+ obj = ksymbol_new_b(K, "get-script-arguments", KNIL);
klisp_assert(kbinds(K, K->ground_env, obj));
obj = kunwrap(kget_binding(K, K->ground_env, obj));
tv2op(obj)->extra[0] = tail;
@@ -559,7 +559,7 @@ static void populate_argument_lists(klisp_State *K, char **argv, int argc,
tail = kimm_cons(K, obj, tail);
}
/* Store the interpreter argument list */
- obj = ksymbol_new(K, "get-interpreter-arguments", KNIL);
+ obj = ksymbol_new_b(K, "get-interpreter-arguments", KNIL);
klisp_assert(kbinds(K, K->ground_env, obj));
obj = kunwrap(kget_binding(K, K->ground_env, obj));
tv2op(obj)->extra[0] = tail;
diff --git a/src/kobject.h b/src/kobject.h
@@ -822,11 +822,6 @@ int32_t kmark_count;
#define kis_mutable(o_) ((tv_get_kflags(o_) & K_FLAG_IMMUTABLE) == 0)
#define kis_immutable(o_) (!kis_mutable(o_))
-/* KFlags for symbols */
-/* has external representation (identifiers) */
-#define K_FLAG_EXT_REP 0x01
-#define khas_ext_rep(s_) ((tv_get_kflags(s_) & K_FLAG_EXT_REP) != 0)
-
/* KFlags for marking continuations */
#define K_FLAG_OUTER 0x01
#define K_FLAG_INNER 0x02
diff --git a/src/kstate.c b/src/kstate.c
@@ -223,7 +223,7 @@ klisp_State *klisp_newstate (klisp_Alloc f, void *ud) {
kset_source_info(K, K->eval_op, si);
/* TODO: si */
- TValue eval_name = ksymbol_new(K, "eval", KNIL);
+ TValue eval_name = ksymbol_new_b(K, "eval", KNIL);
ktry_set_name(K, K->eval_op, eval_name);
K->list_app = kmake_applicative(K, list, 0), line_number = __LINE__;
@@ -238,7 +238,7 @@ klisp_State *klisp_newstate (klisp_Alloc f, void *ud) {
/* MAYBE: fix it so we can remove module_params_sym from roots */
/* TODO si */
- K->module_params_sym = ksymbol_new(K, "module-parameters", KNIL);
+ K->module_params_sym = ksymbol_new_b(K, "module-parameters", KNIL);
/* Create the root and error continuation (will be added to the
environment in kinit_ground_env) */
diff --git a/src/ksymbol.c b/src/ksymbol.c
@@ -8,8 +8,6 @@
#include "ksymbol.h"
#include "kobject.h"
-/* for identifier checking */
-#include "ktoken.h"
#include "kstate.h"
#include "kmem.h"
#include "kgc.h"
@@ -19,10 +17,13 @@
/* NOTE: symbols can have source info, they should be compared with
tv_sym_equal, NOT tv_equal */
-/* TEMP: for now only interned symbols are the ones that don't
- have source info (like those created with string->symbol) */
-TValue ksymbol_new_g(klisp_State *K, const char *buf, int32_t size,
- TValue si, bool identifierp)
+/* No case folding is performed by these constructors */
+
+/*
+** Interned symbols are only the ones that don't have source info
+** (like those created with string->symbol)
+*/
+TValue ksymbol_new_bs(klisp_State *K, const char *buf, int32_t size, TValue si)
{
/* First calculate the hash */
uint32_t h = size; /* seed */
@@ -71,7 +72,7 @@ TValue ksymbol_new_g(klisp_State *K, const char *buf, int32_t size,
differently */
new_sym->gct = klispC_white(K);
new_sym->tt = K_TSYMBOL;
- new_sym->kflags = identifierp? K_FLAG_EXT_REP : 0;
+ new_sym->kflags = 0;
new_sym->si = NULL;
/* symbol specific fields */
@@ -93,8 +94,7 @@ TValue ksymbol_new_g(klisp_State *K, const char *buf, int32_t size,
} else { /* non nil source info */
/* link it with regular objects and save source info */
/* header + gc_fields */
- klispC_link(K, (GCObject *) new_sym, K_TSYMBOL,
- identifierp? K_FLAG_EXT_REP : 0);
+ klispC_link(K, (GCObject *) new_sym, K_TSYMBOL, 0);
/* symbol specific fields */
new_sym->str = new_str;
@@ -107,59 +107,18 @@ TValue ksymbol_new_g(klisp_State *K, const char *buf, int32_t size,
return ret_tv;
}
-/* for indentifiers */
-TValue ksymbol_new_i(klisp_State *K, const char *buf, int32_t size, TValue si)
-{
- return ksymbol_new_g(K, buf, size, si, true);
-}
-
-/* for indentifiers with no size */
-TValue ksymbol_new(klisp_State *K, const char *buf, TValue si)
+/* for c strings with unknown size */
+TValue ksymbol_new_b(klisp_State *K, const char *buf, TValue si)
{
int32_t size = (int32_t) strlen(buf);
- return ksymbol_new_g(K, buf, size, si, true);
+ return ksymbol_new_bs(K, buf, size, si);
}
/* for string->symbol */
/* GC: assumes str is rooted */
-TValue ksymbol_new_check_i(klisp_State *K, TValue str, TValue si)
+TValue ksymbol_new_str(klisp_State *K, TValue str, TValue si)
{
- int32_t size = kstring_size(str);
- char *buf = kstring_buf(str);
- bool identifierp;
-
- /* this is necessary because the empty symbol isn't an identifier */
- /* MAYBE it should throw an error if the string is empty */
- /* XXX: The exact syntax for identifiers isn't there in the report
- yet, here we use something like scheme, and the same as in ktoken.h
- (details, leading numbers '.', '+' and '-' are a no go, but '+' and
- '-' are an exception.
- */
- identifierp = (size > 0);
- if (identifierp) {
- char first = *buf;
- buf++;
- size--;
- if (first == '+' || first == '-')
- identifierp = (size == 0);
- else if (first == '.' || ktok_is_numeric(first))
- identifierp = false;
- else
- identifierp = ktok_is_subsequent(first);
-
- while(identifierp && size--) {
- if (!ktok_is_subsequent(*buf))
- identifierp = false;
- else
- buf++;
- }
- }
- /* recover size & buf*/
- size = kstring_size(str);
- buf = kstring_buf(str);
-
- TValue new_sym = ksymbol_new_g(K, buf, size, si, identifierp);
- return new_sym;
+ return ksymbol_new_bs(K, kstring_buf(str), kstring_size(str), si);
}
bool ksymbolp(TValue obj) { return ttissymbol(obj); }
diff --git a/src/ksymbol.h b/src/ksymbol.h
@@ -17,13 +17,15 @@
/* NOTE: symbols can have source info, they should be compared with
tv_sym_equal, NOT tv_equal */
-/* For identifiers */
-TValue ksymbol_new_i(klisp_State *K, const char *buf, int32_t size,
+/* No case folding is performed by these constructors */
+
+/* buffer + size, may contain nulls */
+TValue ksymbol_new_bs(klisp_State *K, const char *buf, int32_t size,
TValue si);
-/* For identifiers, simplified for unknown size */
-TValue ksymbol_new(klisp_State *K, const char *buf, TValue si);
-/* For general strings, copies str if not immutable */
-TValue ksymbol_new_check_i(klisp_State *K, TValue str, TValue si);
+/* null terminated buffer */
+TValue ksymbol_new_b(klisp_State *K, const char *buf, TValue si);
+/* copies str if not immutable */
+TValue ksymbol_new_str(klisp_State *K, TValue str, TValue si);
#define ksymbol_str(tv_) (tv2sym(tv_)->str)
#define ksymbol_buf(tv_) (kstring_buf(tv2sym(tv_)->str))
diff --git a/src/ktoken.c b/src/ktoken.c
@@ -5,23 +5,10 @@
*/
/*
-** Symbols should be converted to some standard case before interning
-** (in this case downcase)
-*/
-
-/*
** TODO:
**
-** From the Report:
-**
-** - Support for complete number syntax (complex)
-**
-** NOT from the Report:
+** - Support for complete number syntax (complex) (report)
** - Support for unicode (strings, char and symbols).
-** - srfi-30 stype #| ... |# nested comments and srfi-62 style #;
-** sexp comments.
-** - more named chars (like #\tab and in strings "\t")
-** - numeric escaped chars (like #\u0020)
**
*/
#include <stdio.h>
@@ -86,7 +73,8 @@ void kcharset_union(kcharset chs, kcharset chs2)
** Character sets for classification
*/
kcharset ktok_alphabetic, ktok_numeric, ktok_whitespace;
-kcharset ktok_delimiter, ktok_extended, ktok_subsequent;
+kcharset ktok_delimiter, ktok_extended;
+kcharset ktok_initial, ktok_subsequent;
/*
** Special Tokens
@@ -101,7 +89,9 @@ kcharset ktok_delimiter, ktok_extended, ktok_subsequent;
** char in the car and nil in the cdr.
** srfi-38 tokens are also represented with a char in the car indicating if
** it's a defining token ('=') or a referring token ('#') and the number in
-** the cdr. This way a special token can be easily tested for (with ttispair)
+** the cdr.
+** The sexp comment token with a ';' in the car.
+** This way a special token can be easily tested for (with ttispair)
** and easily classified (with switch(chvalue(kcar(tok)))).
**
*/
@@ -118,10 +108,18 @@ void ktok_init(klisp_State *K)
kcharset_fill(ktok_delimiter, "()\";");
kcharset_union(ktok_delimiter, ktok_whitespace);
- kcharset_fill(ktok_extended, "!$%&*+-./:<=>?@^_~");
+ kcharset_fill(ktok_initial, "!$%&*./:<=>?@^_~");
+ kcharset_union(ktok_initial, ktok_alphabetic);
+
+ /* N.B. Unlike in scheme, kernel admits both '.' and
+ '@' as initial chars in identifiers, but doesn't allow
+ '+' or '-'. There are 3 exceptions:
+ both '+' and '-' alone are identifiers and '.' alone is
+ not an identifier */
+ kcharset_fill(ktok_extended, "+-");
kcharset_empty(ktok_subsequent);
- kcharset_union(ktok_subsequent, ktok_alphabetic);
+ kcharset_union(ktok_subsequent, ktok_initial);
kcharset_union(ktok_subsequent, ktok_numeric);
kcharset_union(ktok_subsequent, ktok_extended);
}
@@ -314,13 +312,15 @@ void ktok_ignore_whitespace(klisp_State *K);
void ktok_ignore_single_line_comment(klisp_State *K);
void ktok_ignore_multi_line_comment(klisp_State *K);
bool ktok_check_delimiter(klisp_State *K);
+char ktok_read_hex_escape(klisp_State *K);
TValue ktok_read_string(klisp_State *K);
TValue ktok_read_special(klisp_State *K);
TValue ktok_read_number(klisp_State *K, char *buf, int32_t len,
bool has_exactp, bool exactp, bool has_radixp,
int32_t radix);
TValue ktok_read_maybe_signed_numeric(klisp_State *K);
-TValue ktok_read_identifier(klisp_State *K);
+TValue ktok_read_identifier_or_dot(klisp_State *K);
+TValue ktok_read_bar_identifier(klisp_State *K);
int ktok_read_until_delimiter(klisp_State *K);
/*
@@ -357,17 +357,10 @@ TValue ktok_read_token(klisp_State *K)
case ')':
ktok_getc(K);
return K->ktok_rparen;
- case '.':
- ktok_getc(K);
- if (ktok_check_delimiter(K))
- return K->ktok_dot;
- else {
- ktok_error(K, "no delimiter found after dot");
- /* avoid warning */
- return KINERT;
- }
case '"':
return ktok_read_string(K);
+ case '|':
+ return ktok_read_bar_identifier(K);
/* TODO use read_until_delimiter in all these cases */
case '#': {
ktok_getc(K);
@@ -405,6 +398,8 @@ TValue ktok_read_token(klisp_State *K)
case '+': case '-':
/* signed number, no exactness or radix indicator */
return ktok_read_maybe_signed_numeric(K);
+ case '\\': /* this is a symbol that starts with an hex escape */
+ /* These should be kept synchronized with initial */
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
@@ -416,27 +411,14 @@ TValue ktok_read_token(klisp_State *K)
case '!': case '$': case '%': case '&': case '*': case '/': case ':':
case '<': case '=': case '>': case '?': case '@': case '^': case '_':
case '~':
+ case '.': /* this is either a symbol or a dot token */
/*
- ** NOTE: the cases for '+', '-', '.' and numbers were already
- ** considered so identifier-subsequent is used instead of
- ** identifier-first-char (in the cases above)
+ ** N.B.: the cases for '+', and '-', were already
+ ** considered
*/
- return ktok_read_identifier(K);
- case '|':
- ktok_getc(K);
- chi = ktok_peekc(K);
- if (chi == EOF || chi != '#') {
- chi = '|';
- goto unrecognized_error;
- }
- ktok_getc(K);
- ktok_error(K, "unmatched multiline comment close (\"|#\")");
- /* avoid warning */
- return KINERT;
+ return ktok_read_identifier_or_dot(K);
default:
chi = ktok_getc(K);
- /* TODO add char to error */
- unrecognized_error:
ktok_error_extra(K, "unrecognized token starting char",
ch2tv((char) chi));
/* avoid warning */
@@ -610,7 +592,7 @@ TValue ktok_read_maybe_signed_numeric(klisp_State *K)
/* save the source info in the symbol */
TValue si = ktok_get_source_info(K);
krooted_tvs_push(K, si); /* will be popped by throw */
- TValue new_sym = ksymbol_new_i(K, ks_tbget_buffer(K), 1, si);
+ TValue new_sym = ksymbol_new_bs(K, ks_tbget_buffer(K), 1, si);
krooted_tvs_pop(K); /* already in symbol */
krooted_tvs_push(K, new_sym);
ks_tbclear(K); /* this shouldn't cause gc, but just in case */
@@ -626,6 +608,62 @@ TValue ktok_read_maybe_signed_numeric(klisp_State *K)
}
/*
+** Hex escapes for strings and symbols
+** "#\xXXXXXX;"
+** "#\x" already read
+*/
+
+char ktok_read_hex_escape(klisp_State *K)
+{
+ /* enough space for any unicode char + 2 */
+ int ch;
+ char buf[10];
+ int c = 0;
+ bool at_least_onep = false;
+ for(ch = ktok_getc(K); ch != EOF && ch != ';';
+ ch = ktok_getc(K)) {
+ if (!ktok_is_digit(ch, 16)) {
+ ktok_error_extra(K, "Invalid char found in hex escape",
+ ch2tv(ch));
+ return '\0'; /* avoid warning */
+ }
+ /*
+ ** This will allow one space for '\0' and one extra
+ ** char in case the value is too big, and so will
+ ** naturally result in a value outside the unicode
+ ** range without the need to record any extra
+ ** characters other than the first 8 (without
+ ** leading zeroes).
+ */
+ at_least_onep = true;
+ if (c < sizeof(buf) - 1 && (c > 0 || ch != '0'))
+ buf[c++] = ch;
+ }
+ if (ch == EOF) {
+ ktok_error(K, "EOF found while reading hex escape");
+ return '\0'; /* avoid warning */
+ } else if (!at_least_onep) {
+ ktok_error(K, "Empty hex escape found");
+ return '\0'; /* avoid warning */
+ } else if (c == 0) { /* this is the case of a NULL char */
+ buf[c++] = '0';
+ }
+ buf[c++] = '\0';
+ /* buf now contains the hex value of the char */
+ TValue n;
+ int res = kinteger_read(K, buf, 16, &n, NULL);
+ /* can't fail, all digits were checked already */
+ klisp_assert(res == true);
+ if (!ttisfixint(n) || ivalue(n) > 127) {
+ krooted_tvs_push(K, n);
+ ktok_error_extra(K, "hex escaped char out of ASCII range", n);
+ return '\0'; /* avoid warning */
+ }
+ /* all ok, we pass the char */
+ return (char) ivalue(n);
+}
+
+/*
** Strings
*/
TValue ktok_read_string(klisp_State *K)
@@ -723,55 +761,9 @@ TValue ktok_read_string(klisp_State *K)
string at the top of the loop */
goto just_read;
/* This is an hex escaped char */
- case 'x': {
- /* enough space for any unicode char + 2 */
- char buf[10];
- int c = 0;
- bool at_least_onep = false;
- for(ch = ktok_getc(K); ch != EOF && ch != ';';
- ch = ktok_getc(K)) {
- if (!ktok_is_digit(ch, 16)) {
- ktok_error_extra(K, "Invalid char found in string "
- "hex escape", ch2tv(ch));
- return KINERT; /* avoid warning */
- }
- /*
- ** This will allow one space for '\0' and one extra
- ** char in case the value is too big, and so will
- ** naturally result in a value outside the unicode
- ** range without the need to record any extra
- ** characters other than the first 8 (without
- ** leading zeroes).
- */
- at_least_onep = true;
- if (c < sizeof(buf) - 1 && (c > 0 || ch != '0'))
- buf[c++] = ch;
- }
- if (ch == EOF) {
- ktok_error(K, "EOF found while reading a string");
- return KINERT; /* avoid warning */
- } else if (!at_least_onep) {
- ktok_error(K, "Empty hex escape found while reading "
- "a string");
- return KINERT; /* avoid warning */
- } else if (c == 0) { /* this is the case of a NULL char */
- buf[c++] = '0';
- }
- buf[c++] = '\0';
- /* buf now contains the hex value of the char */
- TValue n;
- int res = kinteger_read(K, buf, 16, &n, NULL);
- /* can't fail, all digits were checked already */
- klisp_assert(res == true);
- if (!ttisfixint(n) || ivalue(n) > 127) {
- ktok_error(K, "hex escaped char out of ASCII range "
- "while reading a string");
- return KINERT; /* avoid warning */
- }
- /* all ok, we pass the char */
- ch = (char) ivalue(n);
+ case 'x':
+ ch = ktok_read_hex_escape(K);
break;
- }
default:
ktok_error_extra(K, "Invalid char after '\\' "
"while reading a string", ch2tv(ch));
@@ -856,7 +848,8 @@ TValue ktok_read_special(klisp_State *K)
}
/* Then check for simple chars, this is the only thing
- that is case dependant, so after this we downcase buf */
+ that is case dependant, so after this we downcase buf
+ (except that an escaped char needs a small 'x' */
/* REFACTOR: move this to a new function */
/* char constant, needs at least 3 chars unless it's a delimiter
* char! */
@@ -865,8 +858,7 @@ TValue ktok_read_special(klisp_State *K)
int ch_i = ktok_getc(K);
if (ch_i == EOF) {
ktok_error(K, "EOF found while reading character name");
- /* avoid warning */
- return KINERT;
+ return KINERT; /* avoid warning */
}
ks_tbclear(K);
return ch2tv((char)ch_i);
@@ -879,7 +871,7 @@ TValue ktok_read_special(klisp_State *K)
** Kernel report (R-1RK))
** For now we follow the scheme report
*/
- char ch = buf[2];
+ char ch = buf[2]; /* we know buf_len > 2 */
if (ch < 0 || ch > 127) {
ktok_error(K, "Non ASCII char found as character constant");
@@ -897,15 +889,20 @@ TValue ktok_read_special(klisp_State *K)
/* fall through */
}
- /* we ignore case in all remaining comparisons */
- for(char *str2 = buf; *str2 != '\0'; str2++)
+ /* first save the third char, in case it's an hex escaped char
+ (that should be a lowercase x) */
+ char saved_third = buf[2]; /* there's at least 2 chars, so in the worst
+ case buf[2] is just '\0' */
+
+ /* now, we ignore case in all remaining comparisons */
+ size_t i = 0;
+ for(char *str2 = buf; i < buf_len; ++str2, ++i)
*str2 = tolower(*str2);
/* REFACTOR: move this to a new function */
/* then check the known constants (including named characters) */
size_t stok_size = sizeof(kspecial_tokens) /
sizeof(struct kspecial_token);
- size_t i;
for (i = 0; i < stok_size; i++) {
struct kspecial_token token = kspecial_tokens[i];
/* NOTE: must check type because buf may contain embedded '\0's */
@@ -920,12 +917,13 @@ TValue ktok_read_special(klisp_State *K)
token or a character escape */
if (buf[1] == '\\') { /* this is to have a meaningful error msg */
- if (buf[2] != 'x') {/* this will also accept 'X' */
+ if (saved_third != 'x') { /* case is significant here, so
+ we use the saved char */
ktok_error(K, "Unrecognized character name");
return KINERT;
}
- /* We already checked that length != 3, so there's at least on
- more char */
+ /* We already checked that length != 3 (x is alphabetic),
+ so there's at least on more char */
TValue n;
char *end;
@@ -1057,26 +1055,50 @@ TValue ktok_read_special(klisp_State *K)
}
/*
-** Identifiers
+** Identifiers (and dot token)
*/
-TValue ktok_read_identifier(klisp_State *K)
+TValue ktok_read_identifier_or_dot(klisp_State *K)
{
- int32_t i = 1;
+ bool seen_dot = false;
+ int32_t i = 0;
while (!ktok_check_delimiter(K)) {
/* NOTE: can't be eof, because eof is a delimiter */
char ch = (char) ktok_getc(K);
-
+ /* this is needed to differentiate a dot from an equivalent escape */
+ seen_dot |= ch == '.';
/* NOTE: is_subsequent of '\0' is false, so no embedded '\0' */
if (ktok_is_subsequent(ch)) {
+ /* downcase all non-escaped chars */
+ ks_tbadd(K, tolower(ch));
+ ++i;
+ } else if (ch == '\\') {
+ /* should be inline hex escape */
+ ch = ktok_getc(K);
+ if (ch == EOF) {
+ ktok_error(K, "EOF found while reading character name");
+ } else if (ch != 'x') {
+ ktok_error_extra(K, "Invalid char in identifier after \\",
+ ch2tv((char)ch));
+ }
+ ch = ktok_read_hex_escape(K);
+ /* don't downcase escaped chars */
ks_tbadd(K, ch);
- i++;
- } else
- ktok_error(K, "Invalid char in identifier");
+ ++i;
+ } else {
+ ktok_error_extra(K, "Invalid char in identifier",
+ ch2tv((char)ch));
+ }
+ }
+
+ if (i == 1 && seen_dot) {
+ ks_tbclear(K);
+ return K->ktok_dot;
}
+
ks_tbadd(K, '\0');
TValue si = ktok_get_source_info(K);
krooted_tvs_push(K, si); /* will be popped by throw */
- TValue new_sym = ksymbol_new_i(K, ks_tbget_buffer(K), i-1, si);
+ TValue new_sym = ksymbol_new_bs(K, ks_tbget_buffer(K), i, si);
krooted_tvs_pop(K); /* already in symbol */
krooted_tvs_push(K, new_sym);
ks_tbclear(K); /* this shouldn't cause gc, but just in case */
@@ -1084,4 +1106,63 @@ TValue ktok_read_identifier(klisp_State *K)
return new_sym;
}
+TValue ktok_read_bar_identifier(klisp_State *K)
+{
+ /* discard opening bar */
+ ktok_getc(K);
+
+ bool done = false;
+ int i = 0;
+
+ /* Never downcase chars in |...| escaped symbols */
+ while(!done) {
+ int ch = ktok_getc(K);
+ if (ch == EOF) {
+ ktok_error(K, "EOF found while reading an |identifier|");
+ return KINERT; /* avoid warning */
+ } else if (ch < 0 || ch > 127) {
+ ktok_error(K, "Non ASCII char found while reading an identifier");
+ return KINERT; /* avoid warning */
+ }
+
+ if (ch == '|') {
+ ks_tbadd(K, '\0');
+ done = true;
+ } else if (ch == '\\') {
+ ch = ktok_getc(K);
+
+ if (ch == EOF) {
+ ktok_error(K, "EOF found while reading an |identifier|");
+ return KINERT; /* avoid warning */
+ }
+
+ switch(ch) {
+ /* These two will self insert */
+ case '|':
+ case '\\':
+ break;
+ case 'x':
+ ch = ktok_read_hex_escape(K);
+ break;
+ default:
+ ktok_error_extra(K, "Invalid char after '\\' "
+ "while reading a symbol", ch2tv(ch));
+ return KINERT; /* avoid warning */
+ }
+ ks_tbadd(K, ch);
+ ++i;
+ } else {
+ ks_tbadd(K, ch);
+ ++i;
+ }
+ }
+ TValue si = ktok_get_source_info(K);
+ krooted_tvs_push(K, si); /* will be popped by throw */
+ TValue new_sym = ksymbol_new_bs(K, ks_tbget_buffer(K), i, si);
+ krooted_tvs_pop(K); /* already in symbol */
+ krooted_tvs_push(K, new_sym);
+ ks_tbclear(K); /* this shouldn't cause gc, but just in case */
+ krooted_tvs_pop(K);
+ return new_sym;
+}
diff --git a/src/ktoken.h b/src/ktoken.h
@@ -35,10 +35,9 @@ inline int ktok_peekc(klisp_State *K) { return ktok_peekc_getc(K, true); }
/* needed by the repl */
void ktok_ignore_whitespace(klisp_State *K);
-/* This is needed for string->symbol to check if a symbol has external
+/* This is needed for kwrite to check if a symbol has external
representation as an identifier */
/* REFACTOR: think out a better interface to all this */
-
/*
** Char set contains macro interface
*/
@@ -49,7 +48,8 @@ void ktok_ignore_whitespace(klisp_State *K);
typedef uint32_t kcharset[8];
extern kcharset ktok_alphabetic, ktok_numeric, ktok_whitespace;
-extern kcharset ktok_delimiter, ktok_extended, ktok_subsequent;
+extern kcharset ktok_delimiter, ktok_extended;
+extern kcharset ktok_subsequent, ktok_initial;
#define ktok_is_alphabetic(chi_) kcharset_contains(ktok_alphabetic, chi_)
#define ktok_is_numeric(chi_) kcharset_contains(ktok_numeric, chi_)
@@ -57,6 +57,7 @@ extern kcharset ktok_delimiter, ktok_extended, ktok_subsequent;
#define ktok_is_whitespace(chi_) kcharset_contains(ktok_whitespace, chi_)
#define ktok_is_delimiter(chi_) ((chi_) == EOF || \
kcharset_contains(ktok_delimiter, chi_))
+#define ktok_is_initial(chi_) kcharset_contains(ktok_initial, chi_)
#define ktok_is_subsequent(chi_) kcharset_contains(ktok_subsequent, chi_)
#define kcharset_contains(kch_, ch_) \
diff --git a/src/kwrite.c b/src/kwrite.c
@@ -10,6 +10,7 @@
#include <assert.h>
#include <inttypes.h>
#include <string.h>
+#include <ctype.h>
#include "kwrite.h"
#include "kobject.h"
@@ -26,6 +27,7 @@
#include "kenvironment.h"
#include "kbytevector.h"
#include "kvector.h"
+#include "ktoken.h" /* for identifier checking */
/*
** Stack for the write FSM
@@ -194,8 +196,8 @@ void kw_print_string(klisp_State *K, TValue str)
(!K->write_displayp &&
(*ptr == '\\' || *ptr == '"')));
++i, ptr++) {
- /* This are all ASCII printable characters + space, except \ and
- " if !displayp */
+ /* This are all ASCII printable characters (including space,
+ and exceptuating '\' and '"' if !displayp) */
char *fmt;
/* must be uint32_t to support all unicode chars
in the future */
@@ -235,6 +237,95 @@ void kw_print_string(klisp_State *K, TValue str)
}
/*
+** Helper for printing symbols.
+** If symbol is not a regular identifier it
+** uses the "|...|" syntax, escaping '|', '\' and
+** non printing characters.
+*/
+void kw_print_symbol(klisp_State *K, TValue sym)
+{
+ uint32_t size = ksymbol_size(sym);
+ char *buf = ksymbol_buf(sym);
+
+ /* first determine if it's a simple identifier */
+ bool identifierp;
+ if (size == 0)
+ identifierp = false;
+ else if (size == 1 && *buf == '.')
+ identifierp = false;
+ else if (size == 1 && (*buf == '+' || *buf == '-'))
+ identifierp = true;
+ else if (*buf == tolower(*buf) && ktok_is_initial(*buf)) {
+ char *ptr = buf;
+ uint32_t i = 0;
+ identifierp = true;
+ while (identifierp && i < size) {
+ char ch = *ptr++;
+ ++i;
+ if (tolower(ch) != ch || !ktok_is_subsequent(ch))
+ identifierp = false;
+ }
+ } else
+ identifierp = false;
+
+ if (identifierp) {
+ /* no problem, just a simple string */
+ kw_printf(K, "%s", buf);
+ return;
+ }
+
+ /*
+ ** In case we get here, we'll have to use the "|...|" syntax
+ */
+ char *ptr = buf;
+ int i = 0;
+
+ kw_printf(K, "|");
+
+ while (i < size) {
+ /* find the longest printf-able substring to avoid calling printf
+ for every char */
+ for (ptr = buf;
+ i < size && *ptr != '\0' &&
+ (*ptr >= 32 && *ptr < 127) &&
+ (*ptr != '\\' && *ptr != '|');
+ i++, ptr++)
+ ;
+
+ /* NOTE: this work even if ptr == buf (which can only happen the
+ first or last time) */
+ char ch = *ptr;
+ *ptr = '\0';
+ kw_printf(K, "%s", buf);
+ *ptr = ch;
+
+ for(; i < size && (*ptr == '\0' || (*ptr < 32 || *ptr >= 127) ||
+ (*ptr == '\\' || *ptr == '|'));
+ ++i, ptr++) {
+ /* This are all ASCII printable characters (including space,
+ and exceptuating '\' and '|') */
+ char *fmt;
+ /* must be uint32_t to support all unicode chars
+ in the future */
+ uint32_t arg;
+ ch = *ptr;
+ switch(*ptr) {
+ /* regular \ escapes */
+ case '|': fmt = "\\%c"; arg = (uint32_t) '|'; break;
+ case '\\': fmt = "\\%c"; arg = (uint32_t) '\\'; break;
+ /* for the rest of the non printable chars,
+ use hex escape */
+ default: fmt = "\\x%x;"; arg = (uint32_t) ch; break;
+ }
+ kw_printf(K, fmt, arg);
+ }
+ buf = ptr;
+ }
+
+ kw_printf(K, "|");
+}
+
+/*
** Mark initialization and clearing
*/
/* GC: root is rooted */
@@ -305,7 +396,8 @@ void kw_set_initial_marks(klisp_State *K, TValue root)
#if KTRACK_NAMES
void kw_print_name(klisp_State *K, TValue obj)
{
- kw_printf(K, ": %s", ksymbol_buf(kget_name(K, obj)));
+ kw_printf(K, ": ");
+ kw_print_symbol(K, kget_name(K, obj));
}
#endif /* KTRACK_NAMES */
@@ -471,12 +563,7 @@ void kwrite_scalar(klisp_State *K, TValue obj)
kw_printf(K, "#%c", bvalue(obj)? 't' : 'f');
break;
case K_TSYMBOL:
- if (khas_ext_rep(obj)) {
- /* TEMP: access symbol structure directly */
- kw_printf(K, "%s", ksymbol_buf(obj));
- } else {
- kw_printf(K, "#[symbol]");
- }
+ kw_print_symbol(K, obj);
break;
case K_TINERT:
kw_printf(K, "#inert");