Added string escapes (new named chars, hex escapes and escaped newlines), both to reading and writing. Modified display to only show newlines and tabs, and show all the other non printing chars as spaces. Refactored ktoken.c a little. - klisp - an open source interpreter for the Kernel Programming Language.

commit 32090a4e18f1f878ff339afdd5b83261a8c1930c
parent ec54b3d633a47512af7075ebf4ac4f9da677ca01
Author: Andres Navarro <canavarro82@gmail.com>
Date:   Tue, 29 Nov 2011 23:48:47 -0300

Added string escapes (new named chars, hex escapes and escaped newlines), both to reading and writing. Modified display to only show newlines and tabs, and show all the other non printing chars as spaces. Refactored ktoken.c a little.

Diffstat:
M TODO  | 1 -
M src/kobject.c  | 1 +
M src/kobject.h  | 2 +-
M src/ktoken.c  | 185 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
M src/ktoken.h  | 17 +++++------------
M src/kwrite.c  | 51 +++++++++++++++++++++++++++++++++++++++++----------

6 files changed, 197 insertions(+), 60 deletions(-)
diff --git a/TODO b/TODO
@@ -40,7 +40,6 @@
 ** string->number (r7rs)
 * reader
 ** symbol escapes (r7rs)
-** string escapes (r7rs)
 ** syntax support for complex numbers (Kernel report)
 * other
 ** optional argument to member? (r7rs)
diff --git a/src/kobject.c b/src/kobject.c
@@ -36,6 +36,7 @@ const TValue kescape = KESCAPE_;
 const TValue kspace = KSPACE_;
 const TValue kdelete = KDELETE_;
 const TValue kvtab = KVTAB_;
+const TValue kformfeed = KFORMFEED_;
 
 /*
 ** The name strings for all TValue types
diff --git a/src/kobject.h b/src/kobject.h
@@ -623,7 +623,7 @@ union GCObject {
 #define KSPACE_ {.tv = {.t = K_TAG_CHAR, .v = { .ch = ' ' }}}
 #define KDELETE_ {.tv = {.t = K_TAG_CHAR, .v = { .ch = '\x7f' }}}
 #define KVTAB_ {.tv = {.t = K_TAG_CHAR, .v = { .ch = '\v' }}}
-
+#define KFORMFEED_ {.tv = {.t = K_TAG_CHAR, .v = { .ch = '\f' }}}
 
 /* RATIONALE: the ones above can be used in initializers */
 #define KNIL ((TValue) KNIL_)
diff --git a/src/ktoken.c b/src/ktoken.c
@@ -112,6 +112,7 @@ void ktok_init(klisp_State *K)
     kcharset_fill(ktok_alphabetic, "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 		  "abcdefghijklmnopqrstuvwxyz");
     kcharset_fill(ktok_numeric, "0123456789");
+    /* keep synchronized with cases in main tokenizer switch */
     kcharset_fill(ktok_whitespace, " \t\v\r\n\f");
 
     kcharset_fill(ktok_delimiter, "()\";");
@@ -330,8 +331,6 @@ TValue ktok_read_token(klisp_State *K)
     klisp_assert(ks_tbisempty(K));
 
     while(true) {
-	ktok_ignore_whitespace(K);
-
 	/* save the source info in case a token starts here */
 	ktok_save_source_info(K);
 
@@ -341,6 +340,14 @@ TValue ktok_read_token(klisp_State *K)
 	case EOF:
 	    ktok_getc(K);
 	    return KEOF;
+	case ' ':
+	case '\n':
+	case '\r':
+	case '\t':
+	case '\v': 
+	case '\f': /* Keep synchronized with whitespace chars */
+	    ktok_ignore_whitespace(K);
+	    continue;
 	case ';':
 	    ktok_ignore_single_line_comment(K);
 	    continue;
@@ -365,7 +372,6 @@ TValue ktok_read_token(klisp_State *K)
 	case '#': {
 	    ktok_getc(K);
 	    chi = ktok_peekc(K);
-
 	    switch(chi) {
 	    case EOF:
 		ktok_error(K, "# constant is too short");
@@ -431,7 +437,8 @@ TValue ktok_read_token(klisp_State *K)
 	    chi = ktok_getc(K);
 	    /* TODO add char to error */
 	unrecognized_error:
-	    ktok_error_extra(K, "unrecognized token starting char", ch2tv((char) chi));
+	    ktok_error_extra(K, "unrecognized token starting char", 
+			     ch2tv((char) chi));
 	    /* avoid warning */
 	    return KINERT;
 	}
@@ -653,42 +660,151 @@ TValue ktok_read_string(klisp_State *K)
     int i = 0;
 
     while(!done) {
-	int chi = ktok_getc(K);
-	char ch = (char) chi;
-
-	if (chi == EOF) {
+	int ch = ktok_getc(K);
+    just_read: /* this comes from escaped newline */
+	if (ch == EOF) {
 	    ktok_error(K, "EOF found while reading a string");
-	    /* avoid warning */
-	    return KINERT;
-	}
-	if (ch < 0 || ch > 127) {
+	    return KINERT; /* avoid warning */
+	} else 	if (ch < 0 || ch > 127) {
 	    ktok_error(K, "Non ASCII char found while reading a string");
-	    /* avoid warning */
-	    return KINERT;
-	} else if (ch == '"') {
+	    return KINERT; /* avoid warning */
+	}
+
+
+	if (ch == '"') {
 	    ks_tbadd(K, '\0');
 	    done = true;
-	} else {
-	    if (ch == '\\') {
-		chi = ktok_getc(K);
+	} else if (ch == '\\') {
+	    ch = ktok_getc(K);
 	
-		if (chi == EOF) {
+	    if (ch == EOF) {
+		ktok_error(K, "EOF found while reading a string");
+		return KINERT; /* avoid warning */
+	    }
+
+	    switch(ch) {
+		/* These two will self insert */
+	    case '"':
+	    case '\\':
+		break;
+		/* These are naming chars (like in c, mostly) */
+	    case '0':
+		ch = '\0';
+		break;
+	    case 'a':
+		ch = '\a';
+		break;
+	    case 'b':
+		ch = '\b';
+		break;
+	    case 't':
+		ch = '\t';
+		break;
+	    case 'n':
+		ch = '\n';
+		break;
+	    case 'r':
+		ch = '\r';
+		break;
+	    case 'v':
+		ch = '\v';
+		break;
+	    case 'f':
+		ch = '\f';
+		break;
+		/* 
+		** These signal an escaped newline (not included in string)
+		*/
+	    case ' ':
+	    case '\t':
+		/* eat up all intraline spacing */
+		while((ch = ktok_getc(K)) != EOF &&
+		      (ch == ' ' || ch == '\t'))
+		    ;
+		if (ch == EOF) {
 		    ktok_error(K, "EOF found while reading a string");
-		    /* avoid warning */
-		    return KINERT;
+		    return KINERT; /* avoid warning */
+		} else if (ch != '\n' && ch != '\r') {
+		    ktok_error(K, "Invalid char found after \\ while "
+			       "reading a string");
+		    return KINERT; /* avoid warning */
 		}
-
-		ch = (char) chi;
-
-		if (ch != '\\' && ch != '"') {
-		    ktok_error(K, "Invalid char after '\\' " 
+		/* fall through */
+	    case '\n': 
+	    case '\r':
+		/* use the r6rs definition for line end */
+		if (ch == 'r') {
+		    ch = ktok_peekc(K);
+		    if (ch != EOF && ch == '\n')
+			ktok_getc(K);
+		}
+		/* eat up all intraline spacing */
+		while((ch = ktok_getc(K)) != EOF &&
+		      (ch == ' ' || ch == '\t'))
+		    ;
+		/* this will check for EOF and continue reading the 
+		   string at the top of the loop */
+		goto just_read;
+		/* This is an hex escaped char */
+	    case 'x': {	   
+		/* enough space for any unicode char + 2 */
+		char buf[10];
+		int c = 0;
+		bool at_least_onep = false;
+		for(ch = ktok_getc(K); ch != EOF && ch != ';'; 
+		      ch = ktok_getc(K)) {
+		    if (!ktok_is_digit(ch, 16)) {
+			ktok_error_extra(K, "Invalid char found in string "
+					 "hex escape", ch2tv(ch));
+			return KINERT; /* avoid warning */
+		    } 
+		    /* 
+		    ** This will allow one space for '\0' and one extra
+		    ** char in case the value is too big, and so will 
+		    ** naturally result in a value outside the unicode
+		    ** range without the need to record any extra 
+		    ** characters other than the first 8 (without 
+		    ** leading zeroes).
+		    */
+		    at_least_onep = true;
+		    if (c < sizeof(buf) - 1 && (c > 0 || ch != '0')) 
+			buf[c++] = ch;
+		}
+		if (ch == EOF) {
+		    ktok_error(K, "EOF found while reading a string");
+		    return KINERT; /* avoid warning */
+		} else if (!at_least_onep) {
+		    ktok_error(K, "Empty hex escape found while reading "
+			       "a string");
+		    return KINERT; /* avoid warning */
+		} else if (c == 0) { /* this is the case of a NULL char */
+		    buf[c++] = '0'; 
+		}
+		buf[c++] = '\0';
+		/* buf now contains the hex value of the char */
+		TValue n;
+		int res = kinteger_read(K, buf, 16, &n, NULL);
+		/* can't fail, all digits were checked already */
+		klisp_assert(res == true);
+		if (!ttisfixint(n) || ivalue(n) > 127) {
+		    ktok_error(K, "hex escaped char out of ASCII range "
 			       "while reading a string");
-		    /* avoid warning */
-		    return KINERT;
+		    return KINERT; /* avoid warning */
 		}
-	    } 
+		/* all ok, we pass the char */
+		ch = (char) ivalue(n);
+		break;
+	    }
+	    default:
+		ktok_error_extra(K, "Invalid char after '\\' " 
+				 "while reading a string", ch2tv(ch));
+		return KINERT; /* avoid warning */
+	    }
 	    ks_tbadd(K, ch);
-	    i++;
+	    ++i;
+	} else { 
+	    ks_tbadd(K, ch);
+	    ++i;
 	}
     }
     /* TEMP: for now strings "read" are mutable but strings "loaded" are
@@ -722,7 +838,7 @@ struct kspecial_token {
 			{ "#undefined", KUNDEF_ },
 			/* 
 			** Character names 
-			** (r7rs + vtab and linefeed from r6rs) 
+			** (r7rs + vtab from r6rs) 
 			*/
 			{ "#\\null", KNULL_ },
 			{ "#\\alarm", KALARM_ },
@@ -733,11 +849,8 @@ struct kspecial_token {
 			{ "#\\escape", KESCAPE_ },
 			{ "#\\space", KSPACE_ }, /* kernel */
 			{ "#\\delete", KDELETE_ },
-			/* r6rs, only */
-			{ "#\\vtab", KVTAB_ }, 
-			{ "#\\linefeed", KNEWLINE_ }, /* same as \newline */
-			{ "#\\esc", KESCAPE_ }, /* same as r7rs \escape */
-			{ "#\\nul", KNULL_ } /* same as r7rs \NULL */
+			{ "#\\vtab", KVTAB_ }, /* r6rs, only */
+			{ "#\\formfeed", KFORMFEED_ } /* r6rs in strings */
  }; 
 
 #define MAX_EXT_REP_SIZE 64  /* all special tokens have less than 64 chars */
diff --git a/src/ktoken.h b/src/ktoken.h
@@ -7,11 +7,12 @@
 #ifndef ktoken_h
 #define ktoken_h
 
+#include <stdio.h>
+#include <ctype.h>
+
 #include "kobject.h"
 #include "kstate.h"
 
-#include <stdio.h>
-
 /*
 ** Tokenizer interface
 */
@@ -63,25 +64,17 @@ extern kcharset ktok_delimiter, ktok_extended, ktok_subsequent;
 	kch_[KCHS_OCTANT(ch__)] & KCHS_BIT(ch__); })
 
 
-/* NOTE: only lowercase chars for hexa */
 inline bool ktok_is_digit(char ch, int32_t radix)
 {
+    ch = tolower(ch);
     return (ktok_is_numeric(ch) && (ch - '0') < radix) ||
 	(ktok_is_alphabetic(ch) && (10 + (ch - 'a')) < radix);
 }
 
 inline int32_t ktok_digit_value(char ch)
 {
+    ch = tolower(ch);
     return (ch <= '9')? ch - '0' : 10 + (ch - 'a');
 }
 
-/* This takes the args in sign magnitude form (sign & res),
-   but must work for any representation of negative numbers */
-inline bool can_add_digit(uint32_t res, bool sign, uint32_t new_digit, 
-			  int32_t radix)
-{
-    return (sign)? res <= -(INT32_MIN + new_digit) / radix :
-	res <= (INT32_MAX - new_digit) / radix;
-}
-
 #endif
diff --git a/src/kwrite.c b/src/kwrite.c
@@ -156,9 +156,12 @@ void kw_print_double(klisp_State *K, TValue tv_double)
 }
 
 /*
-** Helper for printing strings (correcly escapes backslashes and
-** double quotes & prints embedded '\0's). It includes the surrounding
-** double quotes.
+** Helper for printing strings.
+** If !displayp it prints the surrounding double quotes
+** and escapes backslashes, double quotes,
+** and non printable chars (including NULL). 
+** if displayp it doesn't include surrounding quotes and just
+** converts non-printable characters to spaces
 */
 void kw_print_string(klisp_State *K, TValue str)
 {
@@ -175,6 +178,7 @@ void kw_print_string(klisp_State *K, TValue str)
 	 for every char */
 	for (ptr = buf; 
 	     i < size && *ptr != '\0' &&
+		 (*ptr >= 32 && *ptr < 127) &&
 		 (K->write_displayp || (*ptr != '\\' && *ptr != '"')); 
 	     i++, ptr++)
 	    ;
@@ -186,15 +190,42 @@ void kw_print_string(klisp_State *K, TValue str)
 	kw_printf(K, "%s", buf);
 	*ptr = ch;
 
-	while(i < size && (*ptr == '\0' || 
-        	(!K->write_displayp && (*ptr == '\\' || *ptr == '"')))) {
-	    if (*ptr == '\0') {
-		kw_printf(K, "%c", '\0'); /* this may not show in the terminal */
+	for(; i < size && (*ptr == '\0' || (*ptr < 32 || *ptr >= 127) ||
+			   (!K->write_displayp && 
+			    (*ptr == '\\' || *ptr == '"')));
+    	      ++i, ptr++) {
+	    /* This are all ASCII printable characters + space, except \ and
+	       " if !displayp */
+	    char *fmt;
+	    /* must be uint32_t to support all unicode chars
+	     in the future */
+	    uint32_t arg;
+	    ch = *ptr;
+	    if (K->write_displayp) {
+		fmt = "%c";
+		/* in display only show tabs and newlines, 
+		 all other non printables are shown as spaces */
+		arg = (uint32_t) ((ch == '\r' || ch == '\n' || ch == '\t')? 
+				  ch : ' ');
 	    } else {
-		kw_printf(K, "\\%c", *ptr);
+		switch(*ptr) {
+		    /* regular \ escapes */
+		case '\"': fmt = "\\%c"; arg = (uint32_t) '"'; break;
+		case '\\': fmt = "\\%c"; arg = (uint32_t) '\\'; break;
+		case '\0': fmt = "\\%c"; arg = (uint32_t) '0'; break;
+		case '\a': fmt = "\\%c"; arg = (uint32_t) 'a'; break;
+		case '\b': fmt = "\\%c"; arg = (uint32_t) 'b'; break;
+		case '\t': fmt = "\\%c"; arg = (uint32_t) 't'; break;
+		case '\n': fmt = "\\%c"; arg = (uint32_t) 'n'; break;
+		case '\r': fmt = "\\%c"; arg = (uint32_t) 'r'; break;
+		case '\v': fmt = "\\%c"; arg = (uint32_t) 'v'; break;
+		case '\f': fmt = "\\%c"; arg = (uint32_t) 'f'; break;
+		    /* for the rest of the non printable chars, 
+		       use hex escape */
+		default: fmt = "\\x%x;"; arg = (uint32_t) ch; break;
+		}
 	    }
-	    i++;
-	    ptr++;
+	    kw_printf(K, fmt, arg);
 	}
 	buf = ptr;
     }

	klisp an open source interpreter for the Kernel Programming Language.
	git clone http://git.hanabi.in/repos/klisp.git
	Log \| Files \| Refs \| README

M	TODO	\|	1	-
M	src/kobject.c	\|	1	+
M	src/kobject.h	\|	2	+-
M	src/ktoken.c	\|	185	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
M	src/ktoken.h	\|	17	+++++------------
M	src/kwrite.c	\|	51	+++++++++++++++++++++++++++++++++++++++++----------