(almost complete) Tokenizer added. Constructors for pair, string and boolean. Some more macros in kobject.h. - klisp - an open source interpreter for the Kernel Programming Language.

commit 3a588bb82a1b9248f166c0c3f814ce65008a59d0
parent f25090d64f44c1349a200acff7648ff1676a64ff
Author: Andres Navarro <canavarro82@gmail.com>
Date:   Thu, 17 Feb 2011 17:08:38 -0300

(almost complete) Tokenizer added. Constructors for pair, string and boolean. Some more macros in kobject.h.

Diffstat:
M src/Makefile  | 12 +++++++++---
M src/klisp.c  | 54 +++++++++++++++++++-----------------------------------
A src/kobject.c  | 46 ++++++++++++++++++++++++++++++++++++++++++++++
M src/kobject.h  | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------
A src/kpair.c  | 27 +++++++++++++++++++++++++++
A src/kpair.h  | 25 +++++++++++++++++++++++++
A src/kstring.c  | 32 ++++++++++++++++++++++++++++++++
A src/kstring.h  | 15 +++++++++++++++
A src/ksymbol.c  | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
A src/ksymbol.h  | 19 +++++++++++++++++++
A src/ktoken.c  | 541 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/ktoken.h  | 21 +++++++++++++++++++++

12 files changed, 893 insertions(+), 73 deletions(-)
diff --git a/src/Makefile b/src/Makefile
@@ -7,7 +7,7 @@ MYCFLAGS=
 MYLDFLAGS=
 MYLIBS=
 
-CORE_O=	
+CORE_O=	kobject.o ktoken.o kpair.o kstring.o ksymbol.o
 
 KRN_T=	klisp
 KRN_O=	klisp.o
@@ -23,7 +23,7 @@ all:	$(ALL_T)
 o:	$(ALL_O)
 
 $(KRN_T): $(ALL_O)
-	$(CC) -o $@ $(MYLDFLAGS) $(KRN_O) $(LIBS)
+	$(CC) -o $@ $(MYLDFLAGS) $(ALL_O) $(LIBS)
 
 clean:
 	$(RM) $(ALL_T) $(ALL_O)
@@ -31,4 +31,10 @@ clean:
 # list targets that do not create files (but not all makes understand .PHONY)
 .PHONY: all default o clean
 
-klisp.o: klisp.c klisp.h kobject.h
+klisp.o: klisp.c klisp.h kobject.h ktoken.h
+kobject.o: kobject.c kobject.h
+ktoken.o: ktoken.c ktoken.h kobject.h kpair.h kstring.h ksymbol.h
+kpair.o: kpair.c kpair.h kobject.h
+kstring.o: kstring.c kstring.h kobject.h
+# XXX: kpair.h because of use of list as symbol table
+ksymbol.o: ksymbol.c ksymbol.h kobject.h kpair.h
diff --git a/src/klisp.c b/src/klisp.c
@@ -7,44 +7,28 @@
 #include <stdio.h>
 
 #include "kobject.h"
+#include "ktoken.h"
 
 int main(int argc, char *argv[]) 
 {
-    printf("Tests\n");
-
-    printf("\nVariables: \n");
-    printf("nil: %d\n", ttisnil(knil));
-    printf("ignore: %d\n", ttisignore(kignore));
-    printf("inert: %d\n", ttisinert(kinert));
-    printf("eof: %d\n", ttiseof(keof));
-    printf("true: %d\n", ttisboolean(ktrue));
-    printf("false: %d\n", ttisboolean(kfalse));
-
-
-    printf("\nConstants: \n");
-
-    printf("nil: %d\n", ttisnil(KNIL));
-    printf("ignore: %d\n", ttisignore(KIGNORE));
-    printf("inert: %d\n", ttisinert(KINERT));
-    printf("eof: %d\n", ttiseof(KEOF));
-    printf("true: %d\n", ttisboolean(KTRUE));
-    printf("false: %d\n", ttisboolean(KFALSE));
-
-    printf("int: %d\n", 
-	   ttisfixint(((TValue) {.tv = {.t = K_TAG_FIXINT, .v = { .i = 3}}})));
-    printf("double: %d\n", ttisdouble((TValue){.d = 1.0}));
-
-    printf("\nSwitch: \n");
-
-    printf("nil: %d\n", ttype(KNIL));
-    printf("ignore: %d\n", ttype(KIGNORE));
-    printf("inert: %d\n", ttype(KINERT));
-    printf("eof: %d\n", ttype(KEOF));
-    printf("true: %d\n", ttype(KTRUE));
-    printf("false: %d\n", ttype(KFALSE));
-    printf("int: %d\n", 
-	   ttype(((TValue) {.tv = {.t = K_TAG_FIXINT, .v = { .i = 3}}})));
-    printf("double: %d\n", ttype((TValue){.d = 1.0}));
+    /*
+    ** Simple tokenizer loop
+    */
+    printf("Tokenizer Type Test\n");
+    
+    ktok_file = stdin;
+    ktok_init();
+
+    TValue tok = KNIL;
+
+    while(!ttiseof(tok)) {
+	tok = ktok_read_token();
+	if (ttisnil(tok)) {
+	    /* there was an error */
+	    break;
+	}
+	printf("\nToken Type: %s\n", ttname(tok));
+    }
 
     return 0;
 }
diff --git a/src/kobject.c b/src/kobject.c
@@ -0,0 +1,46 @@
+/*
+** kobject.h
+** Type definitions for Kernel Objects
+** See Copyright Notice in klisp.h
+*/
+
+#include "kobject.h"
+
+/*
+** The global const variables
+*/
+const TValue knil = KNIL_;
+const TValue kignore = KIGNORE_;
+const TValue kinert = KINERT_;
+const TValue keof = KEOF_;
+const TValue ktrue = KTRUE_;
+const TValue kfalse = KFALSE_;
+const TValue kepinf = KEPINF_;
+const TValue keminf = KEMINF_;
+
+/*
+** The name strings for all TValue types
+*/
+char *ktv_names[] = {
+    [K_TFIXINT] = "fixint",
+    [K_TBIGINT] = "bigint", 
+    [K_TFIXRAT] = "fixrat", 
+    [K_TBIGRAT] = "bigrat", 
+    [K_TEINF] = "einf", 
+    [K_TDOUBLE] = "double", 
+    [K_TBDOUBLE] = "bdouble", 
+    [K_TIINF] = "iinf", 
+    [K_TRWNPN] = "rwnpn", 
+    [K_TCOMPLEX] = "complex", 
+
+    [K_TNIL] = "nil",       
+    [K_TIGNORE] = "ignore",
+    [K_TINERT] = "inert", 
+    [K_TEOF] = "eof", 
+    [K_TBOOLEAN] = "boolean", 
+    [K_TCHAR] = "char", 
+
+    [K_TPAIR] = "pair", 
+    [K_TSTRING] = "string", 
+    [K_TSYMBOL] = "symbol"
+};
diff --git a/src/kobject.h b/src/kobject.h
@@ -6,7 +6,7 @@
 
 /*
 ** SOURCE NOTE: While the tagging system comes from Mozilla TraceMonkey,
-** o code from TraceMonkey was used.
+** no code from TraceMonkey was used.
 ** The general structure, names and comments of this file follow the 
 ** scheme of Lua. 
 */
@@ -52,16 +52,14 @@ typedef struct __attribute__ ((__packed__)) GCheader {
 
 /*
 ** Tags: Types & Flags
-*/
-
-/*
-** Tagged values in 64 bits (for 32 bit systems)
-** NaN boxing: Values are encoded as double precision NaNs
-** There is one canonical NaN that is used through the interpreter
-** and all remaining NaNs are used to encode the rest of the types
-** (other than double)
-** Canonical NaN: (0)(111 1111 1111) 1000  0000 0000 0000 0000 32(0)
-** Infinities: s(111 1111 1111) 0000  0000 0000 0000 0000 32(0)
+**
+** Nan Boxing: Tagged values in 64 bits (for 32 bit systems)
+** All Values except doubles are encoded as double precision NaNs
+** There is one canonical NaN(?maybe none?) that is used through the 
+** interpreter and all remaining NaNs are used to encode the rest of 
+** the types (other than double)
+** Canonical NaN(?): (0)(111 1111 1111) 1000  0000 0000 0000 0000 32(0)
+** Infinities(?): s(111 1111 1111) 0000  0000 0000 0000 0000 32(0)
 ** Tagged values: (0)(111 1111 1111) 1111  tttt tttt tttt tttt 32(v)
 ** So all tags start with 0x7fff which leaves us 16 bits for the 
 ** tag proper.
@@ -83,6 +81,7 @@ typedef struct __attribute__ ((__packed__)) GCheader {
 #define K_TAG_BASE_TYPE(t) ((t) & K_TAG_BASE_TYPE_MASK)
 
 /*
+** RATIONALE:
 ** Number types are first and ordered to allow easy switch statements
 ** in arithmetic operators. The ones marked with (?) are still in
 ** consideration for separate type tags.
@@ -119,8 +118,15 @@ typedef struct __attribute__ ((__packed__)) GCheader {
 
 #define K_MAKE_VTAG(t) (K_TAG_TAGGED | t)
 
-/* TODO: For now we will only use fixints */
+/*
+** TODO: 
+**
+** - decide if inexact infinities and reals with no
+**    primary values are included in K_TDOUBLE
+** - For now we will only use fixints and exact infinities 
+*/
 #define K_TAG_FIXINT	K_MAKE_VTAG(K_TFIXINT)
+#define K_TAG_EINF	K_MAKE_VTAG(K_TEINF)
 
 #define K_TAG_NIL	K_MAKE_VTAG(K_TNIL)
 #define K_TAG_IGNORE	K_MAKE_VTAG(K_TIGNORE)
@@ -137,17 +143,11 @@ typedef struct __attribute__ ((__packed__)) GCheader {
 ** Macros to test types
 */
 
-/*
-** This is intended for use in switch statements
-** TODO: decide if inexact infinities and reals with no
-** primary values are included in K_TDOUBLE
-*/
+/* NOTE: This is intended for use in switch statements */
 #define ttype(o) ({ TValue o_ = o;			\
 	    ttisdouble(o_)? K_TDOUBLE : ttype_(o_); })
 
-/*
-** This is intended for internal use below. DON'T USE OUTSIDE THIS FILE
-*/
+/* This is intended for internal use below. DON'T USE OUTSIDE THIS FILE */
 #define ttag(o) ((o).tv.t)
 #define ttype_(o) (K_TAG_TYPE(ttag(o)))
 #define tflag_(o) (K_TAG_FLAG(ttag(o)))
@@ -175,7 +175,7 @@ typedef struct __attribute__ ((__packed__)) GCheader {
 typedef union {
     bool b;
     int32_t i;
-    unsigned char ch;
+    char ch;
     GCObject *gc; 
     void *p; 
     /* ... */
@@ -206,21 +206,30 @@ typedef struct __attribute__ ((__packed__)) {
     TValue cdr;
 } Pair;
 
+/* XXX: Symbol should probably contain a String instead of a char buf */
 typedef struct __attribute__ ((__packed__)) {
     CommonHeader;
-    unsigned char b[]; // buffer
+    uint32_t size;
+    char b[];
 } Symbol;
 
+/* 
+** RATIONALE: 
+**
+** Storing size allows embedded '\0's.
+** Note, however, that there are actually size + 1 bytes allocated
+** and that b[size] = '\0'. This is useful for printing strings
+** 
+*/
 typedef struct __attribute__ ((__packed__)) {
     CommonHeader;
-    uint32_t size; // to allow embedded '\0'
-    unsigned char b[]; // buffer
+    uint32_t size; 
+    char b[]; // buffer
 } String;
 
 /*
 ** Union of all Kernel heap-allocated values
 */
-
 /* LUA NOTE: In Lua the corresponding union is in lstate.h */
 union GCObject {
     GCheader gch;
@@ -239,22 +248,66 @@ union GCObject {
 #define KEOF_ {.tv = {.t = K_TAG_EOF, .v = { .i = 0 }}}
 #define KTRUE_ {.tv = {.t = K_TAG_BOOLEAN, .v = { .b = true }}}
 #define KFALSE_ {.tv = {.t = K_TAG_BOOLEAN, .v = { .b = false }}}
+#define KEPINF_ {.tv = {.t = K_TAG_EINF, .v = { .i = 1 }}}
+#define KEMINF_ {.tv = {.t = K_TAG_EINF, .v = { .i = -1 }}}
 
+/* RATIONALE: the ones above can be used in initializers */
 #define KNIL ((TValue) KNIL_)
 #define KINERT ((TValue) KINERT_)
 #define KIGNORE ((TValue) KIGNORE_)
 #define KEOF ((TValue) KEOF_)
 #define KTRUE ((TValue) KTRUE_)
 #define KFALSE ((TValue) KFALSE_)
-
-/*
-** The same constants as global const variables
-*/
-const TValue knil = KNIL_;
-const TValue kignore = KIGNORE_;
-const TValue kinert = KINERT_;
-const TValue keof = KEOF_;
-const TValue ktrue = KTRUE_;
-const TValue kfalse = KFALSE_;
+#define KEPINF ((TValue) KEPINF_)
+#define KEMINF ((TValue) KEMINF_)
+
+/* The same constants as global const variables */
+const TValue knil;
+const TValue kignore;
+const TValue kinert;
+const TValue keof;
+const TValue ktrue;
+const TValue kfalse;
+const TValue kepinf;
+const TValue keminf;
+
+/* Macros to create TValues of non-heap allocated types (for initializers) */
+#define ch2tv_(ch_) {.tv = {.t = K_TAG_CHAR, .v = { .ch = ch_ }}}
+#define i2tv_(i_) {.tv = {.t = K_TAG_FIXINT, .v = { .i = i_ }}}
+#define b2tv_(b_) {.tv = {.t = K_TAG_BOOLEAN, .v = { .b = b_ }}}
+
+/* Macros to create TValues of non-heap allocated types */
+#define ch2tv(ch_) ((TValue) ch2tv_(ch_))
+#define i2tv(i_) ((TValue) i2tv_(i_))
+#define b2tv(b_) ((TValue) b2tv_(b_))
+
+/* Macros to convert a GCObject * into a tagged value */
+/* TODO: add assertions */
+/* LUA NOTE: the corresponding defines are in lstate.h */
+#define gc2tv(t_, o_) ((TValue) {.tv = {.t = t_, \
+					.v = { .gc = obj2gco(o_)}}})
+#define gc2pair(o_) (gc2tv(K_TAG_PAIR, o_))
+#define gc2str(o_) (gc2tv(K_TAG_STRING, o_))
+#define gc2sym(o_) (gc2tv(K_TAG_SYMBOL, o_))
+
+/* Macro to convert a TValue into a specific heap allocated object */
+#define tv2pair(v_) ((Pair *) gcvalue(v_))
+#define tv2str(v_) ((String *) gcvalue(v_))
+#define tv2sym(v_) ((Symbol *) gcvalue(v_))
+
+/* Macro to convert any Kernel object into a GCObject */
+#define obj2gco(v_) ((GCObject *) (v_))
+
+/* Macros to access innertv values */
+/* TODO: add assertions */
+#define ivalue(o_) ((o_).tv.v.i)
+#define bvalue(o_) ((o_).tv.v.b)
+#define chvalue(o_) ((o_).tv.v.ch)
+#define gcvalue(o_) ((o_).tv.v.gc)
+
+/* Macro to obtain a string describing the type of a TValue */#
+#define ttname(tv_) (ktv_names[ttype(tv_)])
+
+extern char *ktv_names[];
 
 #endif
diff --git a/src/kpair.c b/src/kpair.c
@@ -0,0 +1,27 @@
+/*
+** kpair.c
+** Kernel Pairs
+** See Copyright Notice in klisp.h
+*/
+
+/* XXX: for malloc */
+#include <stdlib.h>
+/* TODO: use a generalized alloc function */
+
+#include "kpair.h"
+#include "kobject.h"
+
+/* TODO: Out of memory errors */
+/* XXX: for now all pairs are mutable */
+TValue kcons(TValue car, TValue cdr) 
+{
+    Pair *new_pair = malloc(sizeof(Pair));
+
+    new_pair->next = NULL;
+    new_pair->gct = 0;
+    new_pair->tt = K_TPAIR;
+    new_pair->car = car;
+    new_pair->cdr = cdr;
+
+    return gc2pair(new_pair);
+}
diff --git a/src/kpair.h b/src/kpair.h
@@ -0,0 +1,25 @@
+/*
+** kpair.h
+** Kernel Pairs
+** See Copyright Notice in klisp.h
+*/
+
+#ifndef kpair_h
+#define kpair_h
+
+#include "kobject.h"
+
+/* TODO: add type assertions */
+/* TODO: add more kc[ad]*r combinations */
+#define kcar(p_) (((Pair *)(p_.tv.v.gc))->car)
+#define kcdr(p_) (((Pair *)(p_.tv.v.gc))->cdr)
+
+#define kset_car(p_, v_) (kcar(p_) = v_)
+#define kset_cdr(p_, v_) (kcdr(p_) = v_)
+
+#define kdummy_cons (kcons(KNIL, KNIL))
+
+/* XXX: for now all pairs are mutable */
+TValue kcons(TValue, TValue);
+
+#endif
diff --git a/src/kstring.c b/src/kstring.c
@@ -0,0 +1,32 @@
+/*
+** kstring.c
+** Kernel Strings
+** See Copyright Notice in klisp.h
+*/
+
+/* XXX: for malloc */
+#include <stdlib.h>
+/* TODO: use a generalized alloc function */
+
+#include <string.h>
+
+#include "kstring.h"
+#include "kobject.h"
+
+/* TODO: Out of memory errors */
+/* XXX: for now all strings are mutable */
+TValue kstring_new(const char *buf, uint32_t size)
+{
+    String *new_str = malloc(sizeof(String) + size + 1);
+
+    new_str->next = NULL;
+    new_str->gct = 0;
+    new_str->tt = K_TSTRING;
+    new_str->size = size;
+    /* NOTE: there can be embedded '\0's in a string */
+    memcpy(new_str->b, buf, size);
+    /* NOTE: they still end with a '\0' for convenience in printing */
+    new_str->b[size] = '\0';
+
+    return gc2str(new_str);
+}
diff --git a/src/kstring.h b/src/kstring.h
@@ -0,0 +1,15 @@
+/*
+** kstring.h
+** Kernel Strings
+** See Copyright Notice in klisp.h
+*/
+
+#ifndef kstring_h
+#define kstring_h
+
+#include "kobject.h"
+
+/* XXX: for now all strings are mutable */
+TValue kstring_new(const char *, uint32_t);
+
+#endif
diff --git a/src/ksymbol.c b/src/ksymbol.c
@@ -0,0 +1,51 @@
+/*
+** ksymbol.c
+** Kernel Symbols
+** See Copyright Notice in klisp.h
+*/
+
+/* XXX: for malloc */
+#include <stdlib.h>
+/* TODO: use a generalized alloc function */
+
+#include <string.h>
+
+#include "ksymbol.h"
+#include "kobject.h"
+#include "kpair.h"
+
+/* TODO: replace the list with a hashtable */
+/* TODO: move to global state */
+TValue ksymbol_table = KNIL_;
+
+/* TODO: Out of memory errors */
+TValue ksymbol_new(const char *buf)
+{
+    /* First look for it in the symbol table */
+    TValue tbl = ksymbol_table;
+    while (!ttisnil(tbl)) {
+	TValue first = kcar(tbl);
+	/* NOTE: there are no embedded '\0's in symbols */
+	if (strcmp(buf, tv2sym(first)->b) == 0)
+	    return first;
+	else
+	    tbl = kcdr(tbl);
+    }
+
+    /* Didn't find it, alloc new and save in symbol table */
+    /* NOTE: there are no embedded '\0's in symbols */
+    int32_t size = strlen(buf);
+    Symbol *new_sym = malloc(sizeof(Symbol) + size + 1);
+
+    new_sym->next = NULL;
+    new_sym->gct = 0;
+    new_sym->tt = K_TSYMBOL;
+    new_sym->size = size;
+    memcpy(new_sym->b, buf, size);
+    new_sym->b[size] = '\0';
+
+    TValue new_symv = gc2sym(new_sym);
+    tbl = kcons(new_symv, ksymbol_table);
+    ksymbol_table = tbl;
+    return new_symv;
+}
diff --git a/src/ksymbol.h b/src/ksymbol.h
@@ -0,0 +1,19 @@
+/*
+** ksymbol.h
+** Kernel Symbols
+** See Copyright Notice in klisp.h
+*/
+
+#ifndef ksymbol_h
+#define ksymbol_h
+
+#include "kobject.h"
+
+/* TODO: replace the list with a hashtable */
+/* TODO: move to global state */
+TValue ksymbol_table;
+
+/* XXX: for now all symbols are interned */
+TValue ksymbol_new(const char *);
+
+#endif
diff --git a/src/ktoken.c b/src/ktoken.c
@@ -0,0 +1,541 @@
+/*
+** ktoken.c
+** Tokenizer for the Kernel Programming Language
+** See Copyright Notice in klisp.h
+*/
+
+
+/*
+** TODO:
+**
+** - Support other number types besides fixints and exact infinities
+** - Support for complete number syntax (exactness, radix, etc)
+** - Support for unicode (strings, char and symbols).
+** - Error handling
+** - Source code tracking
+**
+*/
+#include <stdio.h>
+/* XXX for malloc */
+#include <stdlib.h>
+/* TODO: use a generalized alloc function */
+
+#include <string.h>
+#include <ctype.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "ktoken.h"
+#include "kobject.h"
+#include "kpair.h"
+#include "kstring.h"
+#include "ksymbol.h"
+
+/*
+** Char sets for fast ASCII char classification
+*/
+
+/* Each bit correspond to a char in the 0-255 range */
+typedef uint32_t kcharset[8];
+
+/*
+** Char set function/macro interface
+*/
+void kcharset_empty(kcharset);
+void kcharset_fill(kcharset, char *);
+void kcharset_union(kcharset, kcharset);
+#define kcharset_contains(kch_, ch_) \
+    ({ unsigned char ch__ = (unsigned char) ch_;	\
+	kch_[KCHS_OCTANT(ch__)] & KCHS_BIT(ch__); })
+
+    
+/*
+** Char set contains macro interface
+*/
+#define KCHS_OCTANT(ch) (ch >> 5)
+#define KCHS_BIT(ch) (1 << (ch & 0x1f))
+
+void kcharset_empty(kcharset chs)
+{
+    for (int i = 0; i < 8; i++) {
+	chs[i] = 0;
+    }
+}
+
+void kcharset_fill(kcharset chs, char *chars_)
+{
+    unsigned char *chars = (unsigned char *) chars_;
+    unsigned char ch;
+
+    kcharset_empty(chs);
+
+    while ((ch = *chars++)) {
+	chs[KCHS_OCTANT(ch)] |= KCHS_BIT(ch);
+    }
+}
+
+void kcharset_union(kcharset chs, kcharset chs2)
+{
+    for (int i = 0; i < 8; i++) {
+	chs[i] |= chs2[i];
+    }
+}
+
+/*
+** Character sets for classification
+*/
+kcharset ktok_alphabetic, ktok_numeric, ktok_whitespace;
+kcharset ktok_delimiter, ktok_extended, ktok_subsequent;
+
+#define ktok_is_alphabetic(chi_) kcharset_contains(ktok_alphabetic, chi_)
+/* TODO: add is_digit, that takes the base as parameter */
+#define ktok_is_numeric(chi_) kcharset_contains(ktok_numeric, chi_)
+/* TODO: add hex digits */
+#define ktok_digit_value(ch_) (ch_ - '0')
+#define ktok_is_whitespace(chi_) kcharset_contains(ktok_whitespace, chi_)
+#define ktok_is_delimiter(chi_) (chi == EOF || \
+				 kcharset_contains(ktok_delimiter, chi_))
+#define ktok_is_subsequent(chi_) kcharset_contains(ktok_subsequent, chi_)
+
+/*
+** Special Tokens
+*/
+
+/*
+** RATIONALE:
+**
+** Because a pair is not a token, they can be used to represent special tokens
+** instead of creating an otherwise useless special token type
+** lparen, rparen and dot are represented as a pair with the corresponding 
+** char in the car and nil in the cdr.
+** srfi-38 tokens are represented with a boolean in the indicating if it's a 
+** defining token and the number in the cdr
+**
+*/
+TValue ktok_lparen, ktok_rparen, ktok_dot;
+
+/* TODO: move this to the global state */
+char *ktok_buffer;
+/* WORKAROUND: for stdin line buffering & reading of EOF */
+bool ktok_seen_eof;
+uint32_t ktok_buffer_size;
+#define KTOK_BUFFER_INITIAL_SIZE 1024
+
+void ktok_init()
+{
+    /* WORKAROUND: for stdin line buffering & reading of EOF */
+    ktok_seen_eof = false;
+    /* string buffer */
+    /* XXX: for now use a fixed size */
+    ktok_buffer_size = KTOK_BUFFER_INITIAL_SIZE;
+    /* TODO: Out of memory errors */
+    ktok_buffer = malloc(KTOK_BUFFER_INITIAL_SIZE);
+
+    /* Character sets */
+    kcharset_fill(ktok_alphabetic, "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+		  "abcdefghijklmnopqrstuvwxyz");
+    kcharset_fill(ktok_numeric, "0123456789");
+    kcharset_fill(ktok_whitespace, " \t\v\r\n\f");
+
+    kcharset_fill(ktok_delimiter, "()\";");
+    kcharset_union(ktok_delimiter, ktok_whitespace);
+
+    kcharset_fill(ktok_extended, "!$%&*+-./:<=>?@^_~");
+
+    kcharset_empty(ktok_subsequent);
+    kcharset_union(ktok_subsequent, ktok_alphabetic);
+    kcharset_union(ktok_subsequent, ktok_numeric);
+    kcharset_union(ktok_subsequent, ktok_extended);
+
+    /* Special Tokens */
+    ktok_lparen = kcons(ch2tv('('), KNIL);
+    ktok_rparen = kcons(ch2tv(')'), KNIL);
+    ktok_dot = kcons(ch2tv('.'), KNIL);
+}
+
+/*
+** Underlying stream interface
+*/
+int ktok_getc();
+int ktok_peekc();
+
+int ktok_getc() {
+    /* TODO: add location tracking */
+    /* WORKAROUND: for stdin line buffering & reading of EOF */
+    if (ktok_seen_eof) {
+	return EOF;
+    } else {
+	int chi = getc(ktok_file);
+	ktok_seen_eof = (chi == EOF);
+	return chi;
+    }
+}
+
+int ktok_peekc() {
+    int chi = ktok_getc();
+    if (chi != EOF)
+	ungetc(chi, ktok_file);
+    return chi;
+}
+
+/*
+** Error management
+*/
+TValue ktok_error(char *str)
+{
+    /* TODO: Decide on error handling mechanism for reader (& tokenizer) */
+    printf("TOK ERROR: %s\n", str);
+    return KNIL;
+}
+
+
+/*
+** ktok_read_token() helpers
+*/
+void ktok_ignore_whitespace_and_comments();
+bool ktok_check_delimiter();
+TValue ktok_read_string();
+TValue ktok_read_special();
+TValue ktok_read_number(bool);
+TValue ktok_read_maybe_signed_numeric();
+TValue ktok_read_identifier();
+int ktok_read_until_delimiter();
+
+/*
+** Main tokenizer function
+*/
+TValue ktok_read_token ()
+{
+    ktok_ignore_whitespace_and_comments();
+    /*
+    ** NOTE: We jumped over all whitespace
+    ** so either the next token starts here or eof was reached, 
+    ** in any case we save the location of the port
+    */
+
+    /* TODO: add location tracking */
+    /* SCHEME VERSION   (kport-save-loc! kport) */
+
+    int chi = ktok_peekc();
+
+    switch(chi) {
+    case EOF:
+	ktok_getc();
+	return KEOF;
+    case '(':
+	ktok_getc();
+	return ktok_lparen;
+    case ')':
+	ktok_getc();
+	return ktok_rparen;
+    case '.':
+	ktok_getc();
+	if (ktok_check_delimiter())
+	    return ktok_dot;
+	else
+	    return ktok_error("no delimiter found after dot");
+    case '"':
+	return ktok_read_string();
+    case '#':
+	return ktok_read_special();
+    case '0': case '1': case '2': case '3': case '4': 
+    case '5': case '6': case '7': case '8': case '9':
+	return ktok_read_number(true); /* positive number */
+    case '+': case '-':
+	return ktok_read_maybe_signed_numeric();
+    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': 
+    case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': 
+    case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': 
+    case 'V': case 'W': case 'X': case 'Y': case 'Z': 
+    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': 
+    case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': 
+    case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': 
+    case 'v': case 'w': case 'x': case 'y': case 'z': 
+    case '!': case '$': case '%': case '&': case '*': case '/': case ':': 
+    case '<': case '=': case '>': case '?': case '@': case '^': case '_': 
+    case '~': 
+	/*
+	** NOTE: the cases for '+', '-', '.' and numbers were already 
+	** considered so identifier-subsequent is used instead of 
+	** identifier-first-char (in the cases above)
+	*/
+	return ktok_read_identifier();
+    default:
+	ktok_getc();
+	return ktok_error("unrecognized token starting char");
+    }
+}
+
+/*
+** Comments and Whitespace
+*/
+void ktok_ignore_comment()
+{
+    int chi;
+    do {
+	chi = ktok_getc();
+    } while (chi != EOF && chi != '\n');
+}
+
+void ktok_ignore_whitespace_and_comments()
+{
+    /* NOTE: if it's not a whitespace or comment do nothing (even on eof) */
+    bool end = false;
+    while(!end) {
+	int chi = ktok_peekc();
+
+	if (chi == EOF) {
+	    end = true; 
+	} else {
+	    char ch = (char) chi;
+	    if (ktok_is_whitespace(ch)) {
+		ktok_getc();
+	    } else if (ch == ';') {
+		ktok_ignore_comment(); /* NOTE: this also reads again the ';' */
+	    } else {
+		end = true;
+	    }
+	}
+    }
+}
+
+/*
+** Delimiter checking
+*/
+bool ktok_check_delimiter()
+{
+    int chi = ktok_peekc();
+    return (ktok_is_delimiter(chi));
+}
+
+/*
+** Returns the number of bytes read
+*/
+int ktok_read_until_delimiter()
+{
+    int i = 0;
+
+    while (!ktok_check_delimiter()) {
+	/* NOTE: can't be eof, because eof is a delimiter */
+	char ch = (char) ktok_getc();
+	ktok_buffer[i++] = ch;
+
+	if (i + 1 == ktok_buffer_size) {
+	    /* TODO: allow buffer to grow */
+	    break;
+	}
+    }
+    ktok_buffer[i] = '\0';
+    return i;
+}
+
+/*
+** Numbers
+** XXX: for now, only fixints in base 10
+*/
+TValue ktok_read_number(bool is_pos) 
+{
+    int32_t res = 0;
+
+    while(!ktok_check_delimiter()) {
+	/* NOTE: can't be eof because it's a delimiter */
+	char ch = (char) ktok_getc();
+	if (!ktok_is_numeric(ch))
+	    return ktok_error("Not a digit found in number");
+	res = res * 10 + ktok_digit_value(ch);
+    }
+
+    if (!is_pos) 
+	res = -res;
+    return i2tv(res);
+}
+
+TValue ktok_read_maybe_signed_numeric() 
+{
+    /* NOTE: can't be eof, it's either '+' or '-' */
+    char ch = (char) ktok_getc();
+    if (ktok_check_delimiter()) {
+	ktok_buffer[0] = ch;
+	ktok_buffer[1] = '\0';
+	return ksymbol_new(ktok_buffer);
+    } else {
+	return ktok_read_number(ch == '+');
+    }
+}
+
+/*
+** Strings
+*/
+TValue ktok_read_string()
+{
+    /* discard opening quote */
+    ktok_getc();
+
+    bool done = false;
+    int i = 0;
+
+    while(!done) {
+	int chi = ktok_getc();
+	char ch = (char) chi;
+
+	if (chi == EOF)
+	    return ktok_error("EOF found while reading a string");
+	if (ch == '"') {
+	    ktok_buffer[i] = '\0';
+	    done = true;
+	} else {
+	    if (ch == '\\') {
+		chi = ktok_getc();
+	
+		if (chi == EOF)
+		    return ktok_error("EOF found while reading a string");
+
+		ch = (char) chi;
+
+		if (ch != '\\' && ch != '"') {
+		    return ktok_error("Invalid char after '\\' " 
+				      "while reading a string");
+		}
+	    } 
+	    ktok_buffer[i++] = ch;
+		
+	    if (i + 1 == ktok_buffer_size)
+		/* TODO: allow buffer to grow */
+		return ktok_error("Implementation restriction: "
+				  "String too long");
+	}
+    }
+    return kstring_new(ktok_buffer, i);
+}
+
+/*
+** Special constants (starting with "#")
+** (Special number syntax, char constants, #ignore, #inert, srfi-38 tokens) 
+*/
+TValue ktok_read_special()
+{
+    /* discard the '#' */
+    ktok_getc();
+    
+    int chi = ktok_getc();
+    char ch = (char) chi;
+
+    if (chi == EOF)
+	return ktok_error("EOF found while reading a '#' constant");
+
+    switch(ch) {
+    case 'i':
+	/* ignore or inert */
+	/* XXX: could also be an inexact number */
+	ktok_read_until_delimiter();
+	/* NOTE: can use strcmp even in the presence of '\0's */
+	if (strcmp(ktok_buffer, "gnore") == 0)
+	    return KIGNORE;
+	else if (strcmp(ktok_buffer, "nert") == 0)
+	    return KINERT;
+	else
+	    return ktok_error("unexpected char in # constant");
+    case 'e':
+	/* an exact infinity */
+	/* XXX: could also be an exact number */
+	if (ktok_read_until_delimiter()) {
+	    /* NOTE: can use strcmp even in the presence of '\0's */
+	    if (strcmp(ktok_buffer, "+infinity") == 0)
+		return KEPINF;
+	    else if (strcmp(ktok_buffer, "-infinity") == 0)
+		return KEMINF;
+	    else
+		return ktok_error("unexpected char in # constant");
+	} else
+	    return ktok_error("unexpected error in # constant");
+    case 't':
+    case 'f':
+	/* boolean constant */
+	if (ktok_check_delimiter())
+	    return b2tv(ch == 't');
+	else
+	    return ktok_error("unexpected char in # constant");
+    case '\\':
+	/* char constant */
+	/* 
+	** RATIONALE: in the scheme spec (R5RS) it says that only alphabetic 
+	** char constants need a delimiter to disambiguate the cases with 
+	** character names. It would be more consistent if all characters
+	** needed a delimiter (and is probably implied by the yet incomplete
+	** Kernel report (R-1RK))
+	** For now we follow the scheme report 
+	*/
+	chi = ktok_getc();
+	ch = (char) chi;
+
+	if (chi == EOF)
+	    return ktok_error("EOF found while reading a char constant");
+
+	if (!ktok_is_alphabetic(ch) || ktok_check_delimiter())
+	    return ch2tv(ch);
+
+	ktok_read_until_delimiter();
+	char *p = ktok_buffer;
+	while (*p) {
+	    *p = tolower(*p);
+	    p++;
+	}
+	ch = tolower(ch);
+	/* NOTE: can use strcmp even in the presence of '\0's */
+	if (ch == 's' && strcmp(ktok_buffer, "pace") == 0)
+	    return ch2tv(' ');
+	else if (ch == 'n' && strcmp(ktok_buffer, "ewline") == 0)
+	    return ch2tv('\n');
+	else 
+	    return ktok_error("Unrecognized character name");
+    case '0': case '1': case '2': case '3': case '4': 
+    case '5': case '6': case '7': case '8': case '9': {
+	/* srfi-38 type token (can be either a def or ref) */
+	/* TODO: allow bigints */
+	    int32_t res = 0;
+	    while(ch != '#' && ch != '=') {
+		if (!ktok_is_numeric(ch))
+		    return ktok_error("Invalid char found in srfi-38 token");
+
+		res = res * 10 + ktok_digit_value(ch);
+
+		chi = ktok_getc();
+		ch = (char) chi;
+	
+		if (chi == EOF)
+		    return ktok_error("EOF found while reading a srfi-38 token");
+	    }
+	    return kcons(b2tv(ch == '='), i2tv(res));
+	}
+    /* TODO: add real with no primary value and undefined */
+    default:
+	return ktok_error("unexpected char in # constant");
+    }
+}
+
+/*
+** Identifiers
+*/
+TValue ktok_read_identifier()
+{
+    int i = 0;
+
+    while (!ktok_check_delimiter()) {
+	/* NOTE: can't be eof, because eof is a delimiter */
+	char ch = (char) ktok_getc();
+
+	/* NOTE: is_subsequent of '\0' is false, so no embedded '\0' */
+	if (ktok_is_subsequent(ch))
+	    ktok_buffer[i++] = ch;
+	else
+	    return ktok_error("Invalid char in identifier");	    
+
+	/* TODO: allow buffer to grow */
+	if (i + 1 == ktok_buffer_size) {
+	    break;
+	}
+    }
+    ktok_buffer[i] = '\0';
+    return ksymbol_new(ktok_buffer);
+}
+
diff --git a/src/ktoken.h b/src/ktoken.h
@@ -0,0 +1,21 @@
+/*
+** ktoken.h
+** Tokenizer for the Kernel Programming Language
+** See Copyright Notice in klisp.h
+*/
+
+#ifndef ktoken_h
+#define ktoken_h
+
+#include "kobject.h"
+
+/*
+** Tokenizer interface
+*/
+void ktok_init();
+TValue ktok_read_token();
+
+/* TODO: move this to the global state */
+FILE *ktok_file;
+
+#endif

	klisp an open source interpreter for the Kernel Programming Language.
	git clone http://git.hanabi.in/repos/klisp.git
	Log \| Files \| Refs \| README

M	src/Makefile	\|	12	+++++++++---
M	src/klisp.c	\|	54	+++++++++++++++++++-----------------------------------
A	src/kobject.c	\|	46	++++++++++++++++++++++++++++++++++++++++++++++
M	src/kobject.h	\|	123	++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------
A	src/kpair.c	\|	27	+++++++++++++++++++++++++++
A	src/kpair.h	\|	25	+++++++++++++++++++++++++
A	src/kstring.c	\|	32	++++++++++++++++++++++++++++++++
A	src/kstring.h	\|	15	+++++++++++++++
A	src/ksymbol.c	\|	51	+++++++++++++++++++++++++++++++++++++++++++++++++++
A	src/ksymbol.h	\|	19	+++++++++++++++++++
A	src/ktoken.c	\|	541	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	src/ktoken.h	\|	21	+++++++++++++++++++++