Some heavy refactoring in ktoken concerning handling of special constants. - klisp - an open source interpreter for the Kernel Programming Language.

commit 732af539972d2d5f1ebf9f01af2962841871bd4d
parent e6b2a266cce8346dfd2ae48d84440eef7e37fb14
Author: Andres Navarro <canavarro82@gmail.com>
Date:   Wed, 13 Apr 2011 16:24:49 -0300

Some heavy refactoring in ktoken concerning handling of special constants.

Diffstat:
M src/kgports.c  | 2 +-
M src/kobject.c  | 2 ++
M src/kobject.h  | 7 +++++++
M src/krepl.c  | 2 +-
M src/ktoken.c  | 254 ++++++++++++++++++++++++++++++++++++++++++++++---------------------------------
M src/kwrite.c  | 2 +-
M src/kwrite.h  | 2 +-

7 files changed, 161 insertions(+), 110 deletions(-)
diff --git a/src/kgports.c b/src/kgports.c
@@ -211,7 +211,7 @@ void newline(klisp_State *K, TValue *xparams, TValue ptree, TValue denv)
     /* TEMP: for now set this by hand */
     K->curr_out = kport_file(port);
 
-    knewline(K);
+    kwrite_newline(K);
     kapply_cc(K, KINERT);
 }
 
diff --git a/src/kobject.c b/src/kobject.c
@@ -20,6 +20,8 @@ const TValue ktrue = KTRUE_;
 const TValue kfalse = KFALSE_;
 const TValue kepinf = KEPINF_;
 const TValue keminf = KEMINF_;
+const TValue kspace = KSPACE_;
+const TValue knewline = KNEWLINE_;
 
 /*
 ** The name strings for all TValue types
diff --git a/src/kobject.h b/src/kobject.h
@@ -408,6 +408,9 @@ union GCObject {
 #define KFALSE_ {.tv = {.t = K_TAG_BOOLEAN, .v = { .b = false }}}
 #define KEPINF_ {.tv = {.t = K_TAG_EINF, .v = { .i = 1 }}}
 #define KEMINF_ {.tv = {.t = K_TAG_EINF, .v = { .i = -1 }}}
+#define KSPACE_ {.tv = {.t = K_TAG_CHAR, .v = { .ch = ' ' }}}
+#define KNEWLINE_ {.tv = {.t = K_TAG_CHAR, .v = { .ch = '\n' }}}
+
 
 /* RATIONALE: the ones above can be used in initializers */
 #define KNIL ((TValue) KNIL_)
@@ -418,6 +421,8 @@ union GCObject {
 #define KFALSE ((TValue) KFALSE_)
 #define KEPINF ((TValue) KEPINF_)
 #define KEMINF ((TValue) KEMINF_)
+#define KSPACE ((TValue) KSPACE_)
+#define KNEWLINE ((TValue) KNEWLINE_)
 
 /* The same constants as global const variables */
 const TValue knil;
@@ -428,6 +433,8 @@ const TValue ktrue;
 const TValue kfalse;
 const TValue kepinf;
 const TValue keminf;
+const TValue kspace;
+const TValue knewline;
 
 /* Macros to create TValues of non-heap allocated types (for initializers) */
 #define ch2tv_(ch_) {.tv = {.t = K_TAG_CHAR, .v = { .ch = (ch_) }}}
diff --git a/src/krepl.c b/src/krepl.c
@@ -92,7 +92,7 @@ void loop_fn(klisp_State *K, TValue *xparams, TValue obj)
     K->curr_out = stdout;
     K->write_displayp = false;
     kwrite(K, obj);
-    knewline(K);
+    kwrite_newline(K);
     TValue denv = xparams[0];
     create_loop(K, denv);
 } 
diff --git a/src/ktoken.c b/src/ktoken.c
@@ -350,7 +350,7 @@ bool ktok_check_delimiter(klisp_State *K)
 /*
 ** Returns the number of bytes read
 */
-int ktok_read_until_delimiter(klisp_State *K)
+int32_t ktok_read_until_delimiter(klisp_State *K)
 {
     int i = 0;
 
@@ -501,74 +501,51 @@ TValue ktok_read_string(klisp_State *K)
 ** (Special number syntax, char constants, #ignore, #inert, srfi-38 tokens) 
 */
 
-/* TODO always read till delimiter, then process the array instead
-   of reading one by one. Do the same for numbers, use the IMath functions 
-   even */
+/* this include the named chars as a subcase */
+struct kspecial_token {
+    const char *ext_rep; /* downcase external representation */
+    TValue obj;
+} kspecial_tokens[] = { { "#t", KTRUE_ },
+			{ "#f", KFALSE_ },
+			{ "#ignore", KIGNORE_ },
+			{ "#inert", KINERT_ },
+			{ "#e+infinity", KEPINF_ },
+			{ "#e-infinity", KEMINF_ },
+			/* TODO add undefined, real with on primary value,
+			   and inexact infinities */
+			{ "#\\space", KSPACE_ },
+			{ "#\\newline", KNEWLINE_ }
+ }; 
+
+#define MAX_EXT_REP_SIZE 64  /* all special tokens have much than 64 chars */
+
 TValue ktok_read_special(klisp_State *K)
 {
-    /* discard the '#' */
-    ktok_getc(K);
-    
-    int chi = ktok_getc(K);
-    char ch = (char) chi;
+    /* the # is still pending (was only peeked) */
+    int32_t buf_len = ktok_read_until_delimiter(K);
+    char *buf = ks_tbget_buffer(K);
 
-    if (chi == EOF) {
-	ktok_error(K, "EOF found while reading a '#' constant");
+    if (buf_len < 2) {
+	/* we need at least one char in addition to the '#' */
+	ktok_error(K, "# constant is too short");
 	/* avoid warning */
 	return KINERT;
     }
 
-    switch(ch) {
-    case 'i': {
-	/* ignore or inert */
-	/* XXX: could also be an inexact number */
-	ktok_read_until_delimiter(K);
-	/* NOTE: can use strcmp even in the presence of '\0's */
-	TValue ret_val;
-	char *buf = ks_tbget_buffer(K);
-	if (strcmp(buf, "gnore") == 0)
-	    ret_val = KIGNORE;
-	else if (strcmp(buf, "nert") == 0)
-	    ret_val =  KINERT;
-	else {
-	    ktok_error(K, "unexpected char in # constant");
-	    /* avoid warning */
-	    return KINERT;
-	}
-	ks_tbclear(K);
-	return ret_val;
-    }
-    case 'e': {
-	/* an exact infinity */
-	/* XXX: could also be an exact number */
-	ktok_read_until_delimiter(K);
-	TValue ret_val;
-	/* NOTE: can use strcmp even in the presence of '\0's */
-	char *buf = ks_tbget_buffer(K);
-	if (strcmp(buf, "+infinity") == 0) {
-	    ret_val = KEPINF;
-	} else if (strcmp(buf, "-infinity") == 0) {
-	    ret_val =  KEMINF;
-	} else {
-	    ktok_error(K, "unexpected char in # constant");
-	    /* avoid warning */
-	    return KINERT;
-	}
-	ks_tbclear(K);
-	return ret_val;
+    /* first check that is not an output only representation, 
+       they begin with '#[' and end with ']', but we know
+       that buf[0] == '#' */
+    if (buf_len > 2 && buf[1] == '[' && buf[buf_len-1] == ']') {
+	ktok_error(K, "output only representation found");
+	/* avoid warning */
+	return KINERT;
     }
-    case 't':
-    case 'f':
-	/* boolean constant */
-	if (ktok_check_delimiter(K))
-	    return b2tv(ch == 't');
-	else {
-	    ktok_error(K, "unexpected char in # constant");
-	    /* avoid warning */
-	    return KINERT;
-	}
-    case '\\': {
-	/* char constant */
+
+    /* Then check for simple chars, this is the only thing
+       that is case dependant, so after this we downcase buf */
+    /* REFACTOR: move this to a new function */
+    /* char constant, needs at least 3 chars  */
+    if (buf_len > 2 && buf[1] == '\\') {
 	/* 
 	** RATIONALE: in the scheme spec (R5RS) it says that only alphabetic 
 	** char constants need a delimiter to disambiguate the cases with 
@@ -577,15 +554,8 @@ TValue ktok_read_special(klisp_State *K)
 	** Kernel report (R-1RK))
 	** For now we follow the scheme report 
 	*/
-	chi = ktok_getc(K);
 
-	if (chi == EOF) {
-	    ktok_error(K, "EOF found while reading a char constant");
-	    /* avoid warning */
-	    return KINERT;
-	}
-
-	ch = (char) chi;
+	char ch = buf[2];
 
 	if (ch < 0 || ch > 127) {
 	    ktok_error(K, "Non ASCII char found as character constant");
@@ -593,36 +563,52 @@ TValue ktok_read_special(klisp_State *K)
 	    return KINERT;
 	} 
 
-	if (!ktok_is_alphabetic(ch) || ktok_check_delimiter(K))
+	if (!ktok_is_alphabetic(ch) || buf_len == 3) { /* simple char */
+	    ks_tbclear(K);
 	    return ch2tv(ch);
-
-	ktok_read_until_delimiter(K);
-	char *p = ks_tbget_buffer(K);
-	while (*p) {
-	    *p = tolower(*p);
-	    p++;
 	}
-	ch = tolower(ch);
-	/* NOTE: can use strcmp even in the presence of '\0's */
-	char *buf = ks_tbget_buffer(K);
-	if (ch == 's' && strcmp(buf, "pace") == 0)
-	    ch = ' ';
-	else if (ch == 'n' && strcmp(buf, "ewline") == 0)
-	    ch = ('\n');
-	else {
-	    ktok_error(K, "Unrecognized character name");
-	    /* avoid warning */
-	    return KINERT;
+
+	/* char names are a subcase of special tokens so this case
+	   will be handled later */
+	/* fall through */
+    }
+
+    /* we ignore case in all remaining comparisons */
+    for(char *str2 = buf; *str2 != '\0'; str2++)
+	*str2 = tolower(*str2);
+
+    /* REFACTOR: move this to a new function */
+    /* then check the known constants */
+    size_t stok_size = sizeof(kspecial_tokens) / 
+	sizeof(struct kspecial_token);
+    size_t i;
+    for (i = 0; i < stok_size; i++) {
+	struct kspecial_token token = kspecial_tokens[i];
+	/* NOTE: must check type because buf may contain embedded '\0's */
+	if (buf_len == strlen(token.ext_rep) &&
+	       strcmp(token.ext_rep, buf) == 0) {
+	    ks_tbclear(K);
+	    return token.obj; 
 	}
-	ks_tbclear(K);
-	return ch2tv(ch);
     }
-    case '0': case '1': case '2': case '3': case '4': 
-    case '5': case '6': case '7': case '8': case '9': {
+
+    if (buf[1] == '\\') { /* this is to have a meaningful error msg */
+	ktok_error(K, "Unrecognized character name");
+	/* avoid warning */
+	return KINERT;
+    }
+
+    /* REFACTOR: move this to a new function */
+    /* It was not a special token so it must be either a srfi-38 style
+       token, or a char constant or a number. srfi-38 tokens are a '#' a 
+       decimal number and end with a '=' or a '#' */
+    if (buf_len > 2 && ktok_is_numeric(buf[1])) {
 	/* srfi-38 type token (can be either a def or ref) */
 	/* IMPLEMENTATION RESTRICTION: only allow fixints in shared tokens */
 	int32_t res = 0;
-	while(ch != '#' && ch != '=') {
+	int32_t i = 1;
+	char ch = buf[i];
+	while(i < buf_len && ch != '#' && ch != '=') {
 	    if (!ktok_is_numeric(ch)) {
 		ktok_error(K, "Invalid char found in srfi-38 token");
 		/* avoid warning */
@@ -637,24 +623,80 @@ TValue ktok_read_special(klisp_State *K)
 		/* avoid warning */
 		return KINERT;
 	    }
-
-	    chi = ktok_getc(K);
-	    ch = (char) chi;
-	    
-	    if (chi == EOF) {
-		ktok_error(K, "EOF found while reading a srfi-38 token");
-		/* avoid warning */
-		return KINERT;
-	    }
+	    ch = buf[i++];
 	}
+	if (i == buf_len) {
+	    ktok_error(K, "Missing last char in srfi-38 token");
+	    return KINERT;
+	} /* else buf[i] == '#' or '=' */
+	ks_tbclear(K);
 	return kcons(K, ch2tv(ch), i2tv(res));
     }
-    /* TODO: add real with no primary value and undefined */
-    default:
-	ktok_error(K, "unexpected char in # constant");
-	/* avoid warning */
-	return KINERT;
+    
+    /* REFACTOR: move to new function */
+
+    /* the only possibility left is that it is a number with
+       an exactness or radix refix */
+    bool has_exactp = false;
+    bool exactp = false;  /* the default exactness will depend on the format */
+    bool has_radixp = false;
+    int32_t radix = 10;
+	
+    int32_t idx = 1;
+    while (idx < buf_len) {
+	char ch = buf[idx];
+	switch(ch) {
+	case 'i':
+	case 'e':
+	    if (has_exactp) {
+		ktok_error(K, "two exactness prefixes in number");
+		return KINERT;
+	    }
+	    has_exactp = true;
+	    exactp = (ch == 'e');
+	    break;
+	case 'b': radix = 2; goto RADIX;
+	case 'o': radix = 2; goto RADIX;
+	case 'd': radix = 2; goto RADIX;
+	case 'x': radix = 16; goto RADIX;
+	RADIX: 
+	    if (has_radixp) {
+		ktok_error(K, "two radix prefixes in number");
+		return KINERT;
+	    }
+	    has_radixp = true;
+	    break;
+	default:
+	    ktok_error(K, "unexpected char in number after #");
+	    /* avoid warning */
+	    return KINERT;
+	}
+
+	ch = buf[++idx];
+
+	switch(ch) {
+	case '#': {
+	    ++idx; /* get next exacness or radix prefix */
+	    break;
+	}
+	case '0': case '1': case '2': case '3': case '4':
+	case '5': case '6': case '7': case '8': case '9': 
+	case 'a': case 'b': case 'c': case 'd': case 'e':
+	case 'f': case '+': case '-': { /* read the number */
+		/* TODO */
+		ktok_error(K, "not supported");
+		return KINERT;
+	}
+	default:
+	    ktok_error(K, "unexpected char in number");
+	    /* avoid warning */
+	    return KINERT;
+	}
     }
+    /* this means that the number wasn't found after the prefixes */
+    ktok_error(K, "no digits found in number");
+    /* avoid warning */
+    return KINERT;
 }
 
 /*
diff --git a/src/kwrite.c b/src/kwrite.c
@@ -373,7 +373,7 @@ void kwrite(klisp_State *K, TValue obj)
     kw_clear_marks(K, obj);
 }
 
-void knewline(klisp_State *K)
+void kwrite_newline(klisp_State *K)
 {
     kw_printf(K, "\n");
     kw_flush(K);
diff --git a/src/kwrite.h b/src/kwrite.h
@@ -14,7 +14,7 @@
 ** Writer interface
 */
 void kwrite(klisp_State *K, TValue obj);
-void knewline(klisp_State *K);
+void kwrite_newline(klisp_State *K);
 
 #endif

	klisp an open source interpreter for the Kernel Programming Language.
	git clone http://git.hanabi.in/repos/klisp.git
	Log \| Files \| Refs \| README

M	src/kgports.c	\|	2	+-
M	src/kobject.c	\|	2	++
M	src/kobject.h	\|	7	+++++++
M	src/krepl.c	\|	2	+-
M	src/ktoken.c	\|	254	++++++++++++++++++++++++++++++++++++++++++++++---------------------------------
M	src/kwrite.c	\|	2	+-
M	src/kwrite.h	\|	2	+-