Added reader/writer support for symbol escapes (both hex escapes and |...| escaped symbols. Bugfix: only #\x works in hex escaped chars (not #\X). Added '.' as a possible initial character for symbols This follows scheme. Some light refactoring. - klisp - an open source interpreter for the Kernel Programming Language.

commit 207af60949a35641f556f5644702c2187f9e8bce
parent 9a95a9f6f843044e72cee78d5429d5b0c4481588
Author: Andres Navarro <canavarro82@gmail.com>
Date:   Wed, 30 Nov 2011 07:21:34 -0300

Added reader/writer support for symbol escapes (both hex escapes and |...| escaped symbols. Bugfix: only #\x works in hex escaped chars (not #\X). Added '.' as a possible initial character for symbols This follows scheme. Some light refactoring.

Diffstat:
M TODO  | 3 +--
M src/Makefile  | 4 ++--
M src/kghelpers.h  | 10 +++++-----
M src/kgsymbols.c  | 2 +-
M src/klisp.c  | 4 ++--
M src/kobject.h  | 5 -----
M src/kstate.c  | 4 ++--
M src/ksymbol.c  | 69 ++++++++++++++-------------------------------------------------------
M src/ksymbol.h  | 14 ++++++++------
M src/ktoken.c  | 305 ++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------
M src/ktoken.h  | 7 ++++---
M src/kwrite.c  | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------

12 files changed, 328 insertions(+), 204 deletions(-)
diff --git a/TODO b/TODO
@@ -34,8 +34,7 @@
 * applicatives: 
 ** number->string (r7rs)
 ** string->number (r7rs)
-* reader
-** symbol escapes (r7rs)
+* reader/writer
 ** syntax support for complex numbers (Kernel report)
 * other
 ** optional argument to member? (r7rs)
diff --git a/src/Makefile b/src/Makefile
@@ -280,7 +280,7 @@ kstate.o: kstate.c klisp.h kobject.h klimits.h klispconf.h kstate.h \
 kstring.o: kstring.c kstring.h kobject.h klimits.h klisp.h klispconf.h \
  kstate.h ktoken.h kmem.h kgc.h
 ksymbol.o: ksymbol.c ksymbol.h kobject.h klimits.h klisp.h klispconf.h \
- kstate.h ktoken.h kmem.h kstring.h kgc.h
+ kstate.h kmem.h kstring.h kgc.h
 ksystem.o: ksystem.c kobject.h klimits.h klisp.h klispconf.h kstate.h \
  ktoken.h kmem.h kerror.h kpair.h kgc.h ksystem.h
 ksystem.posix.o: ksystem.posix.c kobject.h klimits.h klisp.h klispconf.h \
@@ -298,7 +298,7 @@ kvector.o: kvector.c kvector.h kobject.h klimits.h klisp.h klispconf.h \
 kwrite.o: kwrite.c kwrite.h kobject.h klimits.h klisp.h klispconf.h \
  kstate.h ktoken.h kmem.h kinteger.h imath.h krational.h imrat.h kreal.h \
  kpair.h kgc.h kstring.h ksymbol.h kerror.h ktable.h kport.h \
- kenvironment.h kbytevector.h kvector.h
+ kenvironment.h kbytevector.h kvector.h ktoken.h
 imath.o: imath.c imath.h kobject.h klimits.h klisp.h klispconf.h kstate.h \
  ktoken.h kmem.h kerror.h kpair.h kgc.h
 imrat.o: imrat.c imrat.h imath.h kobject.h klimits.h klisp.h klispconf.h \
diff --git a/src/kghelpers.h b/src/kghelpers.h
@@ -515,7 +515,7 @@ TValue map_for_each_transpose(klisp_State *K, TValue lss,
 /* TODO add si to the symbols */
 #if KTRACK_SI
 #define add_operative(K_, env_, n_, fn_, ...)		\
-    { symbol = ksymbol_new(K_, n_, KNIL);		\
+    { symbol = ksymbol_new_b(K_, n_, KNIL);		\
     value = kmake_operative(K_, fn_, __VA_ARGS__);	\
     TValue str = kstring_new_b_imm(K_, __FILE__);	\
     TValue si = kcons(K, str, kcons(K_, i2tv(__LINE__),	\
@@ -524,7 +524,7 @@ TValue map_for_each_transpose(klisp_State *K, TValue lss,
     kadd_binding(K_, env_, symbol, value); }
 
 #define add_applicative(K_, env_, n_, fn_, ...)				\
-    { symbol = ksymbol_new(K_, n_, KNIL);				\
+    { symbol = ksymbol_new_b(K_, n_, KNIL);				\
 	value = kmake_applicative(K_, fn_, __VA_ARGS__);		\
 	TValue str = kstring_new_b_imm(K_, __FILE__);			\
 	TValue si = kcons(K, str, kcons(K_, i2tv(__LINE__),		\
@@ -534,19 +534,19 @@ TValue map_for_each_transpose(klisp_State *K, TValue lss,
 	kadd_binding(K_, env_, symbol, value); }
 #else /* KTRACK_SI */
 #define add_operative(K_, env_, n_, fn_, ...)		\
-    { symbol = ksymbol_new(K_, n_, KNIL);		\
+    { symbol = ksymbol_new_b(K_, n_, KNIL);		\
 	value = kmake_operative(K_, fn_, __VA_ARGS__);	\
 	kadd_binding(K_, env_, symbol, value); }
 
 #define add_applicative(K_, env_, n_, fn_, ...)			\
-    { symbol = ksymbol_new(K_, n_);				\
+    { symbol = ksymbol_new_b(K_, n_, KNIL);				\
 	value = kmake_applicative(K_, fn_, __VA_ARGS__);	\
 	kadd_binding(K_, env_, symbol, value); }
 #endif /* KTRACK_SI */
 
 #define add_value(K_, env_, n_, v_)			\
     { value = v_;					\
-	symbol = ksymbol_new(K_, n_, KNIL);		\
+	symbol = ksymbol_new_b(K_, n_, KNIL);		\
 	kadd_binding(K_, env_, symbol, v_); }
 
 #endif
diff --git a/src/kgsymbols.c b/src/kgsymbols.c
@@ -61,7 +61,7 @@ void string_to_symbol(klisp_State *K)
     UNUSED(denv);
     bind_1tp(K, ptree, "string", ttisstring, str);
     /* TODO si */
-    TValue new_sym = ksymbol_new_check_i(K, str, KNIL);
+    TValue new_sym = ksymbol_new_str(K, str, KNIL);
     kapply_cc(K, new_sym);
 }
 
diff --git a/src/klisp.c b/src/klisp.c
@@ -548,7 +548,7 @@ static void populate_argument_lists(klisp_State *K, char **argv, int argc,
 	tail = kimm_cons(K, obj, tail);
     }
     /* Store the script argument list */
-    obj = ksymbol_new(K, "get-script-arguments", KNIL);
+    obj = ksymbol_new_b(K, "get-script-arguments", KNIL);
     klisp_assert(kbinds(K, K->ground_env, obj));
     obj = kunwrap(kget_binding(K, K->ground_env, obj));
     tv2op(obj)->extra[0] = tail;
@@ -559,7 +559,7 @@ static void populate_argument_lists(klisp_State *K, char **argv, int argc,
 	tail = kimm_cons(K, obj, tail);
     }
     /* Store the interpreter argument list */
-    obj = ksymbol_new(K, "get-interpreter-arguments", KNIL);
+    obj = ksymbol_new_b(K, "get-interpreter-arguments", KNIL);
     klisp_assert(kbinds(K, K->ground_env, obj));
     obj = kunwrap(kget_binding(K, K->ground_env, obj));
     tv2op(obj)->extra[0] = tail;
diff --git a/src/kobject.h b/src/kobject.h
@@ -822,11 +822,6 @@ int32_t kmark_count;
 #define kis_mutable(o_) ((tv_get_kflags(o_) & K_FLAG_IMMUTABLE) == 0)
 #define kis_immutable(o_) (!kis_mutable(o_))
 
-/* KFlags for symbols */
-/* has external representation (identifiers) */
-#define K_FLAG_EXT_REP 0x01
-#define khas_ext_rep(s_) ((tv_get_kflags(s_) & K_FLAG_EXT_REP) != 0)
-
 /* KFlags for marking continuations */
 #define K_FLAG_OUTER 0x01
 #define K_FLAG_INNER 0x02
diff --git a/src/kstate.c b/src/kstate.c
@@ -223,7 +223,7 @@ klisp_State *klisp_newstate (klisp_Alloc f, void *ud) {
     kset_source_info(K, K->eval_op, si);
 
     /* TODO: si */
-    TValue eval_name = ksymbol_new(K, "eval", KNIL);
+    TValue eval_name = ksymbol_new_b(K, "eval", KNIL);
     ktry_set_name(K, K->eval_op, eval_name);
     
     K->list_app = kmake_applicative(K, list, 0), line_number = __LINE__;
@@ -238,7 +238,7 @@ klisp_State *klisp_newstate (klisp_Alloc f, void *ud) {
 
     /* MAYBE: fix it so we can remove module_params_sym from roots */
     /* TODO si */
-    K->module_params_sym = ksymbol_new(K, "module-parameters", KNIL);
+    K->module_params_sym = ksymbol_new_b(K, "module-parameters", KNIL);
 
     /* Create the root and error continuation (will be added to the 
      environment in kinit_ground_env) */
diff --git a/src/ksymbol.c b/src/ksymbol.c
@@ -8,8 +8,6 @@
 
 #include "ksymbol.h"
 #include "kobject.h"
-/* for identifier checking */
-#include "ktoken.h"
 #include "kstate.h"
 #include "kmem.h"
 #include "kgc.h"
@@ -19,10 +17,13 @@
 /* NOTE: symbols can have source info, they should be compared with
    tv_sym_equal, NOT tv_equal */
 
-/* TEMP: for now only interned symbols are the ones that don't 
-   have source info (like those created with string->symbol) */
-TValue ksymbol_new_g(klisp_State *K, const char *buf, int32_t size, 
-		     TValue si, bool identifierp)
+/* No case folding is performed by these constructors */
+
+/* 
+** Interned symbols are only the ones that don't have source info 
+** (like those created with string->symbol) 
+*/
+TValue ksymbol_new_bs(klisp_State *K, const char *buf, int32_t size, TValue si)
 {
     /* First calculate the hash */
     uint32_t h = size; /* seed */
@@ -71,7 +72,7 @@ TValue ksymbol_new_g(klisp_State *K, const char *buf, int32_t size,
 	   differently */
 	new_sym->gct = klispC_white(K);
 	new_sym->tt = K_TSYMBOL;
-	new_sym->kflags = identifierp? K_FLAG_EXT_REP : 0;
+	new_sym->kflags = 0;
 	new_sym->si = NULL;
 
 	/* symbol specific fields */
@@ -93,8 +94,7 @@ TValue ksymbol_new_g(klisp_State *K, const char *buf, int32_t size,
     } else { /* non nil source info */
 	/* link it with regular objects and save source info */
 	/* header + gc_fields */
-	klispC_link(K, (GCObject *) new_sym, K_TSYMBOL,
-	    identifierp? K_FLAG_EXT_REP : 0);
+	klispC_link(K, (GCObject *) new_sym, K_TSYMBOL, 0);
 	
 	/* symbol specific fields */
 	new_sym->str = new_str;
@@ -107,59 +107,18 @@ TValue ksymbol_new_g(klisp_State *K, const char *buf, int32_t size,
     return ret_tv;
 }
 
-/* for indentifiers */
-TValue ksymbol_new_i(klisp_State *K, const char *buf, int32_t size, TValue si)
-{
-    return ksymbol_new_g(K, buf, size, si, true);
-}
-
-/* for indentifiers with no size */
-TValue ksymbol_new(klisp_State *K, const char *buf, TValue si)
+/* for c strings with unknown size */
+TValue ksymbol_new_b(klisp_State *K, const char *buf, TValue si)
 {
     int32_t size = (int32_t) strlen(buf);
-    return ksymbol_new_g(K, buf, size, si, true);
+    return ksymbol_new_bs(K, buf, size, si);
 }
 
 /* for string->symbol */
 /* GC: assumes str is rooted */
-TValue ksymbol_new_check_i(klisp_State *K, TValue str, TValue si)
+TValue ksymbol_new_str(klisp_State *K, TValue str, TValue si)
 {
-    int32_t size = kstring_size(str);
-    char *buf = kstring_buf(str);
-    bool identifierp;
-    
-    /* this is necessary because the empty symbol isn't an identifier */
-    /* MAYBE it should throw an error if the string is empty */
-    /* XXX: The exact syntax for identifiers isn't there in the report
-       yet, here we use something like scheme, and the same as in ktoken.h
-       (details, leading numbers '.', '+' and '-' are a no go, but '+' and
-       '-' are an exception.
-    */
-    identifierp = (size > 0);
-    if (identifierp) {
-	char first = *buf;
-	buf++;
-	size--;
-	if (first == '+' || first == '-')
-	    identifierp = (size == 0);
-	else if (first == '.' || ktok_is_numeric(first))
-	    identifierp = false;
-	else 
-	    identifierp = ktok_is_subsequent(first);
-
-	while(identifierp && size--) {
-	    if (!ktok_is_subsequent(*buf))
-		identifierp = false;
-	    else
-		buf++;
-	}
-    }
-    /* recover size & buf*/
-    size = kstring_size(str);
-    buf = kstring_buf(str);
-
-    TValue new_sym = ksymbol_new_g(K, buf, size, si, identifierp);
-    return new_sym;
+    return ksymbol_new_bs(K, kstring_buf(str), kstring_size(str), si);
 }
 
 bool ksymbolp(TValue obj) { return ttissymbol(obj); }
diff --git a/src/ksymbol.h b/src/ksymbol.h
@@ -17,13 +17,15 @@
 /* NOTE: symbols can have source info, they should be compared with
    tv_sym_equal, NOT tv_equal */
 
-/* For identifiers */
-TValue ksymbol_new_i(klisp_State *K, const char *buf, int32_t size,
+/* No case folding is performed by these constructors */
+
+/* buffer + size, may contain nulls */
+TValue ksymbol_new_bs(klisp_State *K, const char *buf, int32_t size,
     TValue si);
-/* For identifiers, simplified for unknown size */
-TValue ksymbol_new(klisp_State *K, const char *buf, TValue si);
-/* For general strings, copies str if not immutable */
-TValue ksymbol_new_check_i(klisp_State *K, TValue str, TValue si);
+/* null terminated buffer */
+TValue ksymbol_new_b(klisp_State *K, const char *buf, TValue si);
+/* copies str if not immutable */
+TValue ksymbol_new_str(klisp_State *K, TValue str, TValue si);
 
 #define ksymbol_str(tv_) (tv2sym(tv_)->str)
 #define ksymbol_buf(tv_) (kstring_buf(tv2sym(tv_)->str))
diff --git a/src/ktoken.c b/src/ktoken.c
@@ -5,23 +5,10 @@
 */
 
 /*
-** Symbols should be converted to some standard case before interning
-** (in this case downcase)
-*/
-
-/*
 ** TODO:
 **
-** From the Report:
-**
-** - Support for complete number syntax (complex)
-** 
-** NOT from the Report:
+** - Support for complete number syntax (complex) (report)
 ** - Support for unicode (strings, char and symbols).
-** - srfi-30 stype #| ... |# nested comments and srfi-62 style #;
-**    sexp comments.
-** - more named chars (like #\tab and in strings "\t")
-** - numeric escaped chars (like #\u0020)
 **
 */
 #include <stdio.h>
@@ -86,7 +73,8 @@ void kcharset_union(kcharset chs, kcharset chs2)
 ** Character sets for classification
 */
 kcharset ktok_alphabetic, ktok_numeric, ktok_whitespace;
-kcharset ktok_delimiter, ktok_extended, ktok_subsequent;
+kcharset ktok_delimiter, ktok_extended;
+kcharset ktok_initial, ktok_subsequent;
 
 /*
 ** Special Tokens 
@@ -101,7 +89,9 @@ kcharset ktok_delimiter, ktok_extended, ktok_subsequent;
 ** char in the car and nil in the cdr.
 ** srfi-38 tokens are also represented with a char in the car indicating if 
 ** it's a defining token ('=') or a referring token ('#') and the number in 
-** the cdr. This way a special token can be easily tested for (with ttispair)
+** the cdr. 
+** The sexp comment token with a ';' in the car.
+** This way a special token can be easily tested for (with ttispair)
 ** and easily classified (with switch(chvalue(kcar(tok)))).
 **
 */
@@ -118,10 +108,18 @@ void ktok_init(klisp_State *K)
     kcharset_fill(ktok_delimiter, "()\";");
     kcharset_union(ktok_delimiter, ktok_whitespace);
 
-    kcharset_fill(ktok_extended, "!$%&*+-./:<=>?@^_~");
+    kcharset_fill(ktok_initial, "!$%&*./:<=>?@^_~");
+    kcharset_union(ktok_initial, ktok_alphabetic);
+
+    /* N.B. Unlike in scheme, kernel admits both '.' and 
+       '@' as initial chars in identifiers, but doesn't allow
+       '+' or '-'. There are 3 exceptions:
+       both '+' and '-' alone are identifiers and '.' alone is
+       not an identifier */
+    kcharset_fill(ktok_extended, "+-");
 
     kcharset_empty(ktok_subsequent);
-    kcharset_union(ktok_subsequent, ktok_alphabetic);
+    kcharset_union(ktok_subsequent, ktok_initial);
     kcharset_union(ktok_subsequent, ktok_numeric);
     kcharset_union(ktok_subsequent, ktok_extended);
 }
@@ -314,13 +312,15 @@ void ktok_ignore_whitespace(klisp_State *K);
 void ktok_ignore_single_line_comment(klisp_State *K);
 void ktok_ignore_multi_line_comment(klisp_State *K);
 bool ktok_check_delimiter(klisp_State *K);
+char ktok_read_hex_escape(klisp_State *K);
 TValue ktok_read_string(klisp_State *K);
 TValue ktok_read_special(klisp_State *K);
 TValue ktok_read_number(klisp_State *K, char *buf, int32_t len,
 			bool has_exactp, bool exactp, bool has_radixp, 
 			int32_t radix);
 TValue ktok_read_maybe_signed_numeric(klisp_State *K);
-TValue ktok_read_identifier(klisp_State *K);
+TValue ktok_read_identifier_or_dot(klisp_State *K);
+TValue ktok_read_bar_identifier(klisp_State *K);
 int ktok_read_until_delimiter(klisp_State *K);
 
 /*
@@ -357,17 +357,10 @@ TValue ktok_read_token(klisp_State *K)
 	case ')':
 	    ktok_getc(K);
 	    return K->ktok_rparen;
-	case '.':
-	    ktok_getc(K);
-	    if (ktok_check_delimiter(K))
-		return K->ktok_dot;
-	    else {
-		ktok_error(K, "no delimiter found after dot");
-		/* avoid warning */
-		return KINERT;
-	    }
 	case '"':
 	    return ktok_read_string(K);
+	case '|':
+	    return ktok_read_bar_identifier(K);
 /* TODO use read_until_delimiter in all these cases */
 	case '#': {
 	    ktok_getc(K);
@@ -405,6 +398,8 @@ TValue ktok_read_token(klisp_State *K)
 	case '+': case '-':
 	    /* signed number, no exactness or radix indicator */
 	    return ktok_read_maybe_signed_numeric(K);
+	case '\\': /* this is a symbol that starts with an hex escape */
+	    /* These should be kept synchronized with initial */
 	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': 
 	case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': 
 	case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': 
@@ -416,27 +411,14 @@ TValue ktok_read_token(klisp_State *K)
 	case '!': case '$': case '%': case '&': case '*': case '/': case ':': 
 	case '<': case '=': case '>': case '?': case '@': case '^': case '_': 
 	case '~': 
+	case '.': /* this is either a symbol or a dot token */
 	    /*
-	    ** NOTE: the cases for '+', '-', '.' and numbers were already 
-	    ** considered so identifier-subsequent is used instead of 
-	    ** identifier-first-char (in the cases above)
+	    ** N.B.: the cases for '+', and '-', were already 
+	    ** considered 
 	    */
-	    return ktok_read_identifier(K);
-	case '|':
-	    ktok_getc(K);
-	    chi = ktok_peekc(K);
-	    if (chi == EOF || chi != '#') {
-		chi = '|';
-		goto unrecognized_error;
-	    }
-	    ktok_getc(K);
- 	    ktok_error(K, "unmatched multiline comment close (\"|#\")");
-	    /* avoid warning */
-	    return KINERT;
+	    return ktok_read_identifier_or_dot(K);
 	default:
 	    chi = ktok_getc(K);
-	    /* TODO add char to error */
-	unrecognized_error:
 	    ktok_error_extra(K, "unrecognized token starting char", 
 			     ch2tv((char) chi));
 	    /* avoid warning */
@@ -610,7 +592,7 @@ TValue ktok_read_maybe_signed_numeric(klisp_State *K)
 	/* save the source info in the symbol */
 	TValue si = ktok_get_source_info(K);
 	krooted_tvs_push(K, si); /* will be popped by throw */
-	TValue new_sym = ksymbol_new_i(K, ks_tbget_buffer(K), 1, si);
+	TValue new_sym = ksymbol_new_bs(K, ks_tbget_buffer(K), 1, si);
 	krooted_tvs_pop(K); /* already in symbol */
 	krooted_tvs_push(K, new_sym);
 	ks_tbclear(K); /* this shouldn't cause gc, but just in case */
@@ -626,6 +608,62 @@ TValue ktok_read_maybe_signed_numeric(klisp_State *K)
 }
 
 /*
+** Hex escapes for strings and symbols
+** "#\xXXXXXX;"
+** "#\x" already read
+*/
+
+char ktok_read_hex_escape(klisp_State *K)
+{
+    /* enough space for any unicode char + 2 */
+    int ch;
+    char buf[10];
+    int c = 0;
+    bool at_least_onep = false;
+    for(ch = ktok_getc(K); ch != EOF && ch != ';'; 
+	ch = ktok_getc(K)) {
+	if (!ktok_is_digit(ch, 16)) {
+	    ktok_error_extra(K, "Invalid char found in hex escape", 
+			     ch2tv(ch));
+	    return '\0'; /* avoid warning */
+	} 
+	/* 
+	** This will allow one space for '\0' and one extra
+	** char in case the value is too big, and so will 
+	** naturally result in a value outside the unicode
+	** range without the need to record any extra 
+	** characters other than the first 8 (without 
+	** leading zeroes).
+	*/
+	at_least_onep = true;
+	if (c < sizeof(buf) - 1 && (c > 0 || ch != '0')) 
+	    buf[c++] = ch;
+    }
+    if (ch == EOF) {
+	ktok_error(K, "EOF found while reading hex escape");
+	return '\0'; /* avoid warning */
+    } else if (!at_least_onep) {
+	ktok_error(K, "Empty hex escape found");
+	return '\0'; /* avoid warning */
+    } else if (c == 0) { /* this is the case of a NULL char */
+	buf[c++] = '0'; 
+    }
+    buf[c++] = '\0';
+    /* buf now contains the hex value of the char */
+    TValue n;
+    int res = kinteger_read(K, buf, 16, &n, NULL);
+    /* can't fail, all digits were checked already */
+    klisp_assert(res == true);
+    if (!ttisfixint(n) || ivalue(n) > 127) {
+	krooted_tvs_push(K, n);
+	ktok_error_extra(K, "hex escaped char out of ASCII range", n);
+	return '\0'; /* avoid warning */
+    }
+    /* all ok, we pass the char */
+    return (char) ivalue(n);
+}
+
+/*
 ** Strings
 */
 TValue ktok_read_string(klisp_State *K)
@@ -723,55 +761,9 @@ TValue ktok_read_string(klisp_State *K)
 		   string at the top of the loop */
 		goto just_read;
 		/* This is an hex escaped char */
-	    case 'x': {	   
-		/* enough space for any unicode char + 2 */
-		char buf[10];
-		int c = 0;
-		bool at_least_onep = false;
-		for(ch = ktok_getc(K); ch != EOF && ch != ';'; 
-		      ch = ktok_getc(K)) {
-		    if (!ktok_is_digit(ch, 16)) {
-			ktok_error_extra(K, "Invalid char found in string "
-					 "hex escape", ch2tv(ch));
-			return KINERT; /* avoid warning */
-		    } 
-		    /* 
-		    ** This will allow one space for '\0' and one extra
-		    ** char in case the value is too big, and so will 
-		    ** naturally result in a value outside the unicode
-		    ** range without the need to record any extra 
-		    ** characters other than the first 8 (without 
-		    ** leading zeroes).
-		    */
-		    at_least_onep = true;
-		    if (c < sizeof(buf) - 1 && (c > 0 || ch != '0')) 
-			buf[c++] = ch;
-		}
-		if (ch == EOF) {
-		    ktok_error(K, "EOF found while reading a string");
-		    return KINERT; /* avoid warning */
-		} else if (!at_least_onep) {
-		    ktok_error(K, "Empty hex escape found while reading "
-			       "a string");
-		    return KINERT; /* avoid warning */
-		} else if (c == 0) { /* this is the case of a NULL char */
-		    buf[c++] = '0'; 
-		}
-		buf[c++] = '\0';
-		/* buf now contains the hex value of the char */
-		TValue n;
-		int res = kinteger_read(K, buf, 16, &n, NULL);
-		/* can't fail, all digits were checked already */
-		klisp_assert(res == true);
-		if (!ttisfixint(n) || ivalue(n) > 127) {
-		    ktok_error(K, "hex escaped char out of ASCII range "
-			       "while reading a string");
-		    return KINERT; /* avoid warning */
-		}
-		/* all ok, we pass the char */
-		ch = (char) ivalue(n);
+	    case 'x': 
+		ch = ktok_read_hex_escape(K);
 		break;
-	    }
 	    default:
 		ktok_error_extra(K, "Invalid char after '\\' " 
 				 "while reading a string", ch2tv(ch));
@@ -856,7 +848,8 @@ TValue ktok_read_special(klisp_State *K)
     }
 
     /* Then check for simple chars, this is the only thing
-       that is case dependant, so after this we downcase buf */
+       that is case dependant, so after this we downcase buf
+       (except that an escaped char needs a small 'x' */
     /* REFACTOR: move this to a new function */
     /* char constant, needs at least 3 chars unless it's a delimiter
      * char! */
@@ -865,8 +858,7 @@ TValue ktok_read_special(klisp_State *K)
 	int ch_i = ktok_getc(K);
 	if (ch_i == EOF) {
 	    ktok_error(K, "EOF found while reading character name");
-	    /* avoid warning */
-	    return KINERT;
+	    return KINERT; /* avoid warning */
 	}
 	ks_tbclear(K);
 	return ch2tv((char)ch_i);
@@ -879,7 +871,7 @@ TValue ktok_read_special(klisp_State *K)
 	** Kernel report (R-1RK))
 	** For now we follow the scheme report 
 	*/
-	char ch = buf[2];
+	char ch = buf[2]; /* we know buf_len > 2 */
 
 	if (ch < 0 || ch > 127) {
 	    ktok_error(K, "Non ASCII char found as character constant");
@@ -897,15 +889,20 @@ TValue ktok_read_special(klisp_State *K)
 	/* fall through */
     }
 
-    /* we ignore case in all remaining comparisons */
-    for(char *str2 = buf; *str2 != '\0'; str2++)
+    /* first save the third char, in case it's an hex escaped char
+       (that should be a lowercase x)  */
+    char saved_third = buf[2]; /* there's at least 2 chars, so in the worst
+				  case buf[2] is just '\0' */
+
+    /* now, we ignore case in all remaining comparisons */
+    size_t i = 0;
+    for(char *str2 = buf; i < buf_len; ++str2, ++i)
 	*str2 = tolower(*str2);
 
     /* REFACTOR: move this to a new function */
     /* then check the known constants (including named characters) */
     size_t stok_size = sizeof(kspecial_tokens) / 
 	sizeof(struct kspecial_token);
-    size_t i;
     for (i = 0; i < stok_size; i++) {
 	struct kspecial_token token = kspecial_tokens[i];
 	/* NOTE: must check type because buf may contain embedded '\0's */
@@ -920,12 +917,13 @@ TValue ktok_read_special(klisp_State *K)
        token or a character escape */
 
     if (buf[1] == '\\') { /* this is to have a meaningful error msg */
-	if (buf[2] != 'x') {/* this will also accept 'X' */
+	if (saved_third != 'x') { /* case is significant here, so
+				   we use the saved char */
 	    ktok_error(K, "Unrecognized character name");
 	    return KINERT;
 	}
-	/* We already checked that length != 3, so there's at least on
-	   more char */
+	/* We already checked that length != 3 (x is alphabetic), 
+	   so there's at least on more char */
 	TValue n;
 	char *end;
 
@@ -1057,26 +1055,50 @@ TValue ktok_read_special(klisp_State *K)
 }
 
 /*
-** Identifiers
+** Identifiers (and dot token)
 */
-TValue ktok_read_identifier(klisp_State *K)
+TValue ktok_read_identifier_or_dot(klisp_State *K)
 {
-    int32_t i = 1;
+    bool seen_dot = false;
+    int32_t i = 0;
     while (!ktok_check_delimiter(K)) {
 	/* NOTE: can't be eof, because eof is a delimiter */
 	char ch = (char) ktok_getc(K);
-
+	/* this is needed to differentiate a dot from an equivalent escape */
+	seen_dot |= ch == '.';
 	/* NOTE: is_subsequent of '\0' is false, so no embedded '\0' */
 	if (ktok_is_subsequent(ch)) {
+	    /* downcase all non-escaped chars */
+	    ks_tbadd(K, tolower(ch));
+	    ++i;
+	} else if (ch == '\\') {
+	    /* should be inline hex escape */
+	    ch = ktok_getc(K);
+	    if (ch == EOF) {
+		ktok_error(K, "EOF found while reading character name");
+	    } else if (ch != 'x') {
+		ktok_error_extra(K, "Invalid char in identifier after \\", 
+				 ch2tv((char)ch));
+	    }
+	    ch = ktok_read_hex_escape(K);
+	    /* don't downcase escaped chars */
 	    ks_tbadd(K, ch);
-	    i++;
-	} else
-	    ktok_error(K, "Invalid char in identifier");	    
+	    ++i;
+	} else {
+	    ktok_error_extra(K, "Invalid char in identifier", 
+			     ch2tv((char)ch));
+	}
+    }
+
+    if (i == 1 && seen_dot) {
+	ks_tbclear(K);
+	return K->ktok_dot;
     }
+
     ks_tbadd(K, '\0');
     TValue si = ktok_get_source_info(K);
     krooted_tvs_push(K, si); /* will be popped by throw */
-    TValue new_sym = ksymbol_new_i(K, ks_tbget_buffer(K), i-1, si);
+    TValue new_sym = ksymbol_new_bs(K, ks_tbget_buffer(K), i, si);
     krooted_tvs_pop(K); /* already in symbol */
     krooted_tvs_push(K, new_sym);
     ks_tbclear(K); /* this shouldn't cause gc, but just in case */
@@ -1084,4 +1106,63 @@ TValue ktok_read_identifier(klisp_State *K)
     return new_sym;
 }
 
+TValue ktok_read_bar_identifier(klisp_State *K)
+{
+    /* discard opening bar */
+    ktok_getc(K);
+
+    bool done = false;
+    int i = 0;
+
+    /* Never downcase chars in |...| escaped symbols */
+    while(!done) {
+	int ch = ktok_getc(K);
+	if (ch == EOF) {
+	    ktok_error(K, "EOF found while reading an |identifier|");
+	    return KINERT; /* avoid warning */
+	} else 	if (ch < 0 || ch > 127) {
+	    ktok_error(K, "Non ASCII char found while reading an identifier");
+	    return KINERT; /* avoid warning */
+	}
+
+	if (ch == '|') {
+	    ks_tbadd(K, '\0');
+	    done = true;
+	} else if (ch == '\\') {
+	    ch = ktok_getc(K);
+	
+	    if (ch == EOF) {
+		ktok_error(K, "EOF found while reading an |identifier|");
+		return KINERT; /* avoid warning */
+	    }
+
+	    switch(ch) {
+		/* These two will self insert */
+	    case '|':
+	    case '\\':
+		break;
+	    case 'x':
+		ch = ktok_read_hex_escape(K);
+		break;
+	    default:
+		ktok_error_extra(K, "Invalid char after '\\' " 
+				 "while reading a symbol", ch2tv(ch));
+		return KINERT; /* avoid warning */
+	    }
+	    ks_tbadd(K, ch);
+	    ++i;
+	} else { 
+	    ks_tbadd(K, ch);
+	    ++i;
+	}
+    }
+    TValue si = ktok_get_source_info(K);
+    krooted_tvs_push(K, si); /* will be popped by throw */
+    TValue new_sym = ksymbol_new_bs(K, ks_tbget_buffer(K), i, si);
+    krooted_tvs_pop(K); /* already in symbol */
+    krooted_tvs_push(K, new_sym);
+    ks_tbclear(K); /* this shouldn't cause gc, but just in case */
+    krooted_tvs_pop(K);
+    return new_sym;
+}
 
diff --git a/src/ktoken.h b/src/ktoken.h
@@ -35,10 +35,9 @@ inline int ktok_peekc(klisp_State *K) { return ktok_peekc_getc(K, true); }
 /* needed by the repl */
 void ktok_ignore_whitespace(klisp_State *K);
 
-/* This is needed for string->symbol to check if a symbol has external
+/* This is needed for kwrite to check if a symbol has external
    representation as an identifier */
 /* REFACTOR: think out a better interface to all this */
-
 /*
 ** Char set contains macro interface
 */
@@ -49,7 +48,8 @@ void ktok_ignore_whitespace(klisp_State *K);
 typedef uint32_t kcharset[8];
 
 extern kcharset ktok_alphabetic, ktok_numeric, ktok_whitespace;
-extern kcharset ktok_delimiter, ktok_extended, ktok_subsequent;
+extern kcharset ktok_delimiter, ktok_extended;
+extern kcharset ktok_subsequent, ktok_initial;
 
 #define ktok_is_alphabetic(chi_) kcharset_contains(ktok_alphabetic, chi_)
 #define ktok_is_numeric(chi_) kcharset_contains(ktok_numeric, chi_)
@@ -57,6 +57,7 @@ extern kcharset ktok_delimiter, ktok_extended, ktok_subsequent;
 #define ktok_is_whitespace(chi_) kcharset_contains(ktok_whitespace, chi_)
 #define ktok_is_delimiter(chi_) ((chi_) == EOF ||			\
 				 kcharset_contains(ktok_delimiter, chi_))
+#define ktok_is_initial(chi_) kcharset_contains(ktok_initial, chi_)
 #define ktok_is_subsequent(chi_) kcharset_contains(ktok_subsequent, chi_)
 
 #define kcharset_contains(kch_, ch_) \
diff --git a/src/kwrite.c b/src/kwrite.c
@@ -10,6 +10,7 @@
 #include <assert.h>
 #include <inttypes.h>
 #include <string.h>
+#include <ctype.h>
 
 #include "kwrite.h"
 #include "kobject.h"
@@ -26,6 +27,7 @@
 #include "kenvironment.h"
 #include "kbytevector.h"
 #include "kvector.h"
+#include "ktoken.h" /* for identifier checking */
 
 /*
 ** Stack for the write FSM
@@ -194,8 +196,8 @@ void kw_print_string(klisp_State *K, TValue str)
 			   (!K->write_displayp && 
 			    (*ptr == '\\' || *ptr == '"')));
     	      ++i, ptr++) {
-	    /* This are all ASCII printable characters + space, except \ and
-	       " if !displayp */
+	    /* This are all ASCII printable characters (including space,
+	       and exceptuating '\' and '"' if !displayp) */
 	    char *fmt;
 	    /* must be uint32_t to support all unicode chars
 	     in the future */
@@ -235,6 +237,95 @@ void kw_print_string(klisp_State *K, TValue str)
 }
 
 /*
+** Helper for printing symbols.
+** If symbol is not a regular identifier it
+** uses the "|...|" syntax, escaping '|', '\' and 
+** non printing characters.
+*/
+void kw_print_symbol(klisp_State *K, TValue sym)
+{
+    uint32_t size = ksymbol_size(sym);
+    char *buf = ksymbol_buf(sym);
+
+    /* first determine if it's a simple identifier */
+    bool identifierp;
+    if (size == 0)
+	identifierp = false;
+    else if (size == 1 && *buf == '.')
+	identifierp = false;
+    else if (size == 1 && (*buf == '+' || *buf == '-'))
+	identifierp = true;
+    else if (*buf == tolower(*buf) && ktok_is_initial(*buf)) {
+	char *ptr = buf;
+	uint32_t i = 0;
+	identifierp = true;
+	while (identifierp && i < size) {
+	    char ch = *ptr++;
+	    ++i;
+	    if (tolower(ch) != ch || !ktok_is_subsequent(ch))
+		identifierp = false;
+	}
+    } else
+	identifierp = false;
+
+    if (identifierp) {
+	/* no problem, just a simple string */
+	kw_printf(K, "%s", buf);
+	return;
+    } 
+
+    /*
+    ** In case we get here, we'll have to use the "|...|" syntax
+    */
+    char *ptr = buf;
+    int i = 0;
+
+    kw_printf(K, "|");
+
+    while (i < size) {
+	/* find the longest printf-able substring to avoid calling printf
+	 for every char */
+	for (ptr = buf; 
+	     i < size && *ptr != '\0' &&
+		 (*ptr >= 32 && *ptr < 127) &&
+		 (*ptr != '\\' && *ptr != '|'); 
+	     i++, ptr++)
+	    ;
+
+	/* NOTE: this work even if ptr == buf (which can only happen the 
+	 first or last time) */
+	char ch = *ptr;
+	*ptr = '\0';
+	kw_printf(K, "%s", buf);
+	*ptr = ch;
+
+	for(; i < size && (*ptr == '\0' || (*ptr < 32 || *ptr >= 127) ||
+			   (*ptr == '\\' || *ptr == '|'));
+	      ++i, ptr++) {
+	    /* This are all ASCII printable characters (including space,
+	       and exceptuating '\' and '|') */
+	    char *fmt;
+	    /* must be uint32_t to support all unicode chars
+	     in the future */
+	    uint32_t arg;
+	    ch = *ptr;
+	    switch(*ptr) {
+		/* regular \ escapes */
+	    case '|': fmt = "\\%c"; arg = (uint32_t) '|'; break;
+	    case '\\': fmt = "\\%c"; arg = (uint32_t) '\\'; break;
+		/* for the rest of the non printable chars, 
+		   use hex escape */
+	    default: fmt = "\\x%x;"; arg = (uint32_t) ch; break;
+	    }
+	    kw_printf(K, fmt, arg);
+	}
+	buf = ptr;
+    }
+			
+    kw_printf(K, "|");
+}
+
+/*
 ** Mark initialization and clearing
 */
 /* GC: root is rooted */
@@ -305,7 +396,8 @@ void kw_set_initial_marks(klisp_State *K, TValue root)
 #if KTRACK_NAMES
 void kw_print_name(klisp_State *K, TValue obj)
 {
-    kw_printf(K, ": %s", ksymbol_buf(kget_name(K, obj)));
+    kw_printf(K, ": ");
+    kw_print_symbol(K, kget_name(K, obj));
 }
 #endif /* KTRACK_NAMES */
 
@@ -471,12 +563,7 @@ void kwrite_scalar(klisp_State *K, TValue obj)
 	kw_printf(K, "#%c", bvalue(obj)? 't' : 'f');
 	break;
     case K_TSYMBOL:
-	if (khas_ext_rep(obj)) {
-	    /* TEMP: access symbol structure directly */
-	    kw_printf(K, "%s", ksymbol_buf(obj));
-	} else {
-	    kw_printf(K, "#[symbol]");
-	}
+	kw_print_symbol(K, obj);
 	break;
     case K_TINERT:
 	kw_printf(K, "#inert");

	klisp an open source interpreter for the Kernel Programming Language.
	git clone http://git.hanabi.in/repos/klisp.git
	Log \| Files \| Refs \| README

M	TODO	\|	3	+--
M	src/Makefile	\|	4	++--
M	src/kghelpers.h	\|	10	+++++-----
M	src/kgsymbols.c	\|	2	+-
M	src/klisp.c	\|	4	++--
M	src/kobject.h	\|	5	-----
M	src/kstate.c	\|	4	++--
M	src/ksymbol.c	\|	69	++++++++++++++-------------------------------------------------------
M	src/ksymbol.h	\|	14	++++++++------
M	src/ktoken.c	\|	305	++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------
M	src/ktoken.h	\|	7	++++---
M	src/kwrite.c	\|	105	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------