commit f086542f31e6f088370a8fb7eefe28f446e9b4cb
parent a82abb4dcb78d08961369a5360b5cedaf14bc9c7
Author: Andres Navarro <canavarro82@gmail.com>
Date:   Wed, 23 Mar 2011 16:12:56 -0300
Modified symbol internal struct to include an array.
Added non-identifier symbols ouput only representation to kwrite.
Diffstat:
9 files changed, 131 insertions(+), 52 deletions(-)
diff --git a/src/Makefile b/src/Makefile
@@ -48,7 +48,8 @@ ktoken.o: ktoken.c ktoken.h kobject.h kstate.h kpair.h kstring.h ksymbol.h \
 kpair.o: kpair.c kpair.h kobject.h kstate.h kmem.h klisp.h
 kstring.o: kstring.c kstring.h kobject.h kstate.h kmem.h klisp.h
 # XXX: kpair.h because of use of list as symbol table
-ksymbol.o: ksymbol.c ksymbol.h kobject.h kpair.h kstate.h kmem.h klisp.h
+ksymbol.o: ksymbol.c ksymbol.h kobject.h kpair.h kstring.h kstate.h kmem.h \
+	klisp.h
 kread.o: kread.c kread.h kobject.h ktoken.h kpair.h kstate.h kerror.h klisp.h \
 	kport.h
 kwrite.o: kwrite.c kwrite.h kobject.h kpair.h kstring.h kstate.h kerror.h \
diff --git a/src/kobject.h b/src/kobject.h
@@ -249,15 +249,12 @@ typedef struct __attribute__ ((__packed__)) {
     TValue si; /* source code info (either () or (filename line col) */
 } Pair;
 
-/* XXX: Symbol should probably contain a String instead of a char buf */
 typedef struct __attribute__ ((__packed__)) {
     CommonHeader;
     TValue mark; /* for cycle/sharing aware algorithms */
-    uint32_t size;
-    char b[];
+    TValue str; /* could use String * here, but for now... */
 } Symbol;
 
-
 typedef struct __attribute__ ((__packed__)) {
     CommonHeader;
     TValue mark; /* for cycle/sharing aware algorithms */
@@ -477,6 +474,11 @@ extern char *ktv_names[];
 #define gch_get_flags(o_) (obj2gch(o_)->flags)
 #define tv_get_flags(o_) (gch_get_flags(tv2gch(o_)))
 
+/* Flags for symbols */
+/* has external representation (identifiers) */
+#define K_FLAG_EXT_REP 0x01
+#define khas_ext_rep(s_) ((tv_get_flags(s_) & K_FLAG_EXT_REP) != 0)
+
 /* Flags for marking continuations */
 #define K_FLAG_OUTER 0x01
 #define K_FLAG_INNER 0x02
diff --git a/src/kstate.c b/src/kstate.c
@@ -439,7 +439,8 @@ void klisp_close (klisp_State *K)
 	    klispM_free(K, (Pair *)obj);
 	    break;
 	case K_TSYMBOL:
-	    klispM_freemem(K, obj, sizeof(Symbol)+obj->sym.size+1);
+	    /* The string will be freed before/after */
+	    klispM_free(K, (Symbol *)obj);
 	    break;
 	case K_TSTRING:
 	    klispM_freemem(K, obj, sizeof(String)+obj->str.size+1);
diff --git a/src/kstring.h b/src/kstring.h
@@ -19,8 +19,8 @@ TValue kstring_new(klisp_State *K, const char *buf, uint32_t size);
 TValue kstring_new_g(klisp_State *K, uint32_t size);
 TValue kstring_new_sc(klisp_State *K, uint32_t size, char fill);
 
-#define kstring_buf(tv_) (((String *) ((tv_).tv.v.gc))->b)
-#define kstring_size(tv_) (((String *) ((tv_).tv.v.gc))->size)
+#define kstring_buf(tv_) (tv2str(tv_)->b)
+#define kstring_size(tv_) (tv2str(tv_)->size)
 
 #define kstring_is_empty(tv_) (kstring_size(tv_) == 0)
 
diff --git a/src/ksymbol.c b/src/ksymbol.c
@@ -8,45 +8,101 @@
 
 #include "ksymbol.h"
 #include "kobject.h"
+/* for identifier checking */
+#include "ktoken.h"
 #include "kpair.h"
 #include "kstate.h"
 #include "kmem.h"
 
-TValue ksymbol_new(klisp_State *K, const char *buf)
+TValue ksymbol_new_g(klisp_State *K, const char *buf, int32_t size, 
+		     bool identifierp)
 {
     /* TODO: replace symbol list with hashtable */
     /* First look for it in the symbol table */
     TValue tbl = K->symbol_table;
+
     while (!ttisnil(tbl)) {
 	TValue first = kcar(tbl);
-	/* NOTE: there are no embedded '\0's in symbols */
-	if (strcmp(buf, tv2sym(first)->b) == 0)
+	/* NOTE: there are no embedded '\0's in identifiers but 
+	 they could be in other symbols */
+	if (size == ksymbol_size(first) && 
+	    memcmp(buf, ksymbol_buf(first), size) == 0) {
 	    return first;
-	else
+	} else
 	    tbl = kcdr(tbl);
     }
 
-    /* Didn't find it, alloc new and save in symbol table */
+    /* Didn't find it, alloc new string and save in symbol table */
     /* NOTE: there are no embedded '\0's in symbols */
-    int32_t size = strlen(buf);
-    Symbol *new_sym = klispM_malloc(K, sizeof(Symbol) + size + 1);
-
+    /* GC: root new_str */
+    TValue new_str = kstring_new(K, buf, size); /* this copies the buf */
+    Symbol *new_sym = klispM_new(K, Symbol);
     
     /* header + gc_fields */
     new_sym->next = K->root_gc;
     K->root_gc = (GCObject *)new_sym;
     new_sym->gct = 0;
     new_sym->tt = K_TSYMBOL;
-    new_sym->flags = 0;
+    new_sym->flags = identifierp? K_FLAG_EXT_REP : 0; 
 
     /* symbol specific fields */
     new_sym->mark = KFALSE;
-    new_sym->size = size;
-    memcpy(new_sym->b, buf, size);
-    new_sym->b[size] = '\0';
+    new_sym->str = new_str;
 
     TValue new_symv = gc2sym(new_sym);
-    /* XXX: new_symv unrooted */
+    /* GC: root new_symb */
     K->symbol_table = kcons(K, new_symv, K->symbol_table);
     return new_symv;
 }
+
+/* for indentifiers */
+TValue ksymbol_new_i(klisp_State *K, const char *buf, int32_t size)
+{
+    return ksymbol_new_g(K, buf, size, true);
+}
+
+/* for indentifiers with no size */
+TValue ksymbol_new(klisp_State *K, const char *buf)
+{
+    int32_t size = (int32_t) strlen(buf);
+    return ksymbol_new_g(K, buf, size, true);
+}
+
+/* for string->symbol */
+TValue ksymbol_new_check_i(klisp_State *K, TValue str)
+{
+    int32_t size = kstring_size(str);
+    char *buf = kstring_buf(str);
+    bool identifierp;
+    
+    /* this is necessary because the empty symbol isn't an identifier */
+    /* MAYBE it should throw an error if the string is empty */
+    /* XXX: The exact syntax for identifiers isn't there in the report
+       yet, here we use something like scheme, and the same as in ktoken.h
+       (details, leading numbers '.', '+' and '-' are a no go, but '+' and
+       '-' are an exception.
+    */
+    identifierp = (size > 0);
+    if (identifierp) {
+	char first = *buf;
+	buf++;
+	size--;
+	if (first == '+' || first == '-')
+	    identifierp = (size == 0);
+	else if (first == '.' || ktok_is_numeric(first))
+	    identifierp = false;
+	else 
+	    identifierp = ktok_is_subsequent(first);
+
+	while(identifierp && size--) {
+	    if (ktok_is_subsequent(*buf))
+		identifierp = false;
+	    else
+		buf++;
+	}
+    }
+    /* recover size & buf*/
+    size = kstring_size(str);
+    buf = kstring_buf(str);
+    return ksymbol_new_g(K, buf, size, identifierp);
+}
diff --git a/src/ksymbol.h b/src/ksymbol.h
@@ -9,11 +9,18 @@
 
 #include "kobject.h"
 #include "kstate.h"
+#include "kstring.h"
 #include "kmem.h"
 
 /* TEMP: for now all symbols are interned */
+/* For identifiers */
+TValue ksymbol_new_i(klisp_State *K, const char *buf, int32_t size);
+/* For identifiers, simplified for unknown size */
 TValue ksymbol_new(klisp_State *K, const char *buf);
+/* For general strings */
+TValue ksymbol_new_check_i(klisp_State *K, TValue str);
 
-#define ksymbol_buf(tv_) (((Symbol *) ((tv_).tv.v.gc))->b)
+#define ksymbol_buf(tv_) (kstring_buf(tv2sym(tv_)->str))
+#define ksymbol_size(tv_) (kstring_size(tv2sym(tv_)->str))
 
 #endif
diff --git a/src/ktoken.c b/src/ktoken.c
@@ -44,26 +44,14 @@
 ** Char sets for fast ASCII char classification
 */
 
-/* Each bit correspond to a char in the 0-255 range */
-typedef uint32_t kcharset[8];
-
 /*
 ** Char set function/macro interface
 */
 void kcharset_empty(kcharset);
 void kcharset_fill(kcharset, char *);
 void kcharset_union(kcharset, kcharset);
-#define kcharset_contains(kch_, ch_) \
-    ({ unsigned char ch__ = (unsigned char) (ch_);	\
-	kch_[KCHS_OCTANT(ch__)] & KCHS_BIT(ch__); })
-
+/* contains in .h */
     
-/*
-** Char set contains macro interface
-*/
-#define KCHS_OCTANT(ch) ((ch) >> 5)
-#define KCHS_BIT(ch) (1 << ((ch) & 0x1f))
-
 void kcharset_empty(kcharset chs)
 {
     for (int i = 0; i < 8; i++) {
@@ -96,16 +84,6 @@ void kcharset_union(kcharset chs, kcharset chs2)
 kcharset ktok_alphabetic, ktok_numeric, ktok_whitespace;
 kcharset ktok_delimiter, ktok_extended, ktok_subsequent;
 
-#define ktok_is_alphabetic(chi_) kcharset_contains(ktok_alphabetic, chi_)
-/* TODO: add is_digit, that takes the base as parameter */
-#define ktok_is_numeric(chi_) kcharset_contains(ktok_numeric, chi_)
-/* TODO: add hex digits */
-#define ktok_digit_value(ch_) (ch_ - '0')
-#define ktok_is_whitespace(chi_) kcharset_contains(ktok_whitespace, chi_)
-#define ktok_is_delimiter(chi_) ((chi_) == EOF ||			\
-				 kcharset_contains(ktok_delimiter, chi_))
-#define ktok_is_subsequent(chi_) kcharset_contains(ktok_subsequent, chi_)
-
 /*
 ** Special Tokens 
 **
@@ -408,7 +386,7 @@ TValue ktok_read_maybe_signed_numeric(klisp_State *K)
     if (ktok_check_delimiter(K)) {
 	ks_tbadd(K, ch);
 	ks_tbadd(K, '\0');
-	TValue new_sym = ksymbol_new(K, ks_tbget_buffer(K));
+	TValue new_sym = ksymbol_new_i(K, ks_tbget_buffer(K), 1);
 	ks_tbclear(K);
 	return new_sym;
     } else {
@@ -626,18 +604,20 @@ TValue ktok_read_special(klisp_State *K)
 */
 TValue ktok_read_identifier(klisp_State *K)
 {
+    int32_t i = 1;
     while (!ktok_check_delimiter(K)) {
 	/* NOTE: can't be eof, because eof is a delimiter */
 	char ch = (char) ktok_getc(K);
 
 	/* NOTE: is_subsequent of '\0' is false, so no embedded '\0' */
-	if (ktok_is_subsequent(ch))
+	if (ktok_is_subsequent(ch)) {
 	    ks_tbadd(K, ch);
-	else
+	    i++;
+	} else
 	    ktok_error(K, "Invalid char in identifier");	    
     }
     ks_tbadd(K, '\0');
-    TValue new_sym = ksymbol_new(K, ks_tbget_buffer(K));
+    TValue new_sym = ksymbol_new_i(K, ks_tbget_buffer(K), i-1);
     ks_tbclear(K);
     return new_sym;
 }
diff --git a/src/ktoken.h b/src/ktoken.h
@@ -23,5 +23,34 @@ TValue ktok_get_source_info(klisp_State *K);
 /* This is needed here to allow cleanup of shared dict from tokenizer */
 void clear_shared_dict(klisp_State *K);
 
+/* This is needed for string->symbol to check if a symbol has external
+   representation as an identifier */
+/* REFACTOR: think out a better interface to all this */
+
+/* Each bit correspond to a char in the 0-255 range */
+typedef uint32_t kcharset[8];
+
+extern kcharset ktok_alphabetic, ktok_numeric, ktok_whitespace;
+extern kcharset ktok_delimiter, ktok_extended, ktok_subsequent;
+
+#define ktok_is_alphabetic(chi_) kcharset_contains(ktok_alphabetic, chi_)
+/* TODO: add is_digit, that takes the base as parameter */
+#define ktok_is_numeric(chi_) kcharset_contains(ktok_numeric, chi_)
+/* TODO: add hex digits */
+#define ktok_digit_value(ch_) (ch_ - '0')
+#define ktok_is_whitespace(chi_) kcharset_contains(ktok_whitespace, chi_)
+#define ktok_is_delimiter(chi_) ((chi_) == EOF ||			\
+				 kcharset_contains(ktok_delimiter, chi_))
+#define ktok_is_subsequent(chi_) kcharset_contains(ktok_subsequent, chi_)
+
+#define kcharset_contains(kch_, ch_) \
+    ({ unsigned char ch__ = (unsigned char) (ch_);	\
+	kch_[KCHS_OCTANT(ch__)] & KCHS_BIT(ch__); })
+
+/*
+** Char set contains macro interface
+*/
+#define KCHS_OCTANT(ch) ((ch) >> 5)
+#define KCHS_BIT(ch) (1 << ((ch) & 0x1f))
 
 #endif
diff --git a/src/kwrite.c b/src/kwrite.c
@@ -188,9 +188,12 @@ void kwrite_simple(klisp_State *K, TValue obj)
 	kw_printf(K, "#%c", bvalue(obj)? 't' : 'f');
 	break;
     case K_TSYMBOL:
-	/* TEMP: access symbol structure directly */
-	/* TEMP: for now assume all symbols have external representations */
-	kw_printf(K, "%s", ksymbol_buf(obj));
+	if (khas_ext_rep(obj)) {
+	    /* TEMP: access symbol structure directly */
+	    kw_printf(K, "%s", ksymbol_buf(obj));
+	} else {
+	    kw_printf(K, "#[symbol]");
+	}
 	break;
     case K_TINERT:
 	kw_printf(K, "#inert");